diff --git a/.github/workflows/benchmark-memory-showdown.yml b/.github/workflows/benchmark-memory-showdown.yml new file mode 100644 index 00000000..224968a8 --- /dev/null +++ b/.github/workflows/benchmark-memory-showdown.yml @@ -0,0 +1,113 @@ +name: Temporal memory showdown + +"on": + workflow_dispatch: + inputs: + repeats: + description: Measured query repetitions after warm-up + required: false + default: "5" + +permissions: + contents: read + +jobs: + benchmark: + runs-on: ubuntu-latest + timeout-minutes: 90 + env: + MEM0_TELEMETRY: "false" + BENCH_HOME: /tmp/temporal-memory-showdown + SETUPTOOLS_SCM_PRETEND_VERSION: "0.0.0" + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install benchmark + run: | + python -m pip install -e . + python -m pip install -r examples/benchmarks/temporal-memory-showdown/requirements.txt + + - name: Configure Moorcheh On-Prem + env: + HOME: ${{ env.BENCH_HOME }} + run: | + python - <<'PY' + from moorcheh.user_config import EmbeddingConfig, LlmConfig, save_runtime_config + + save_runtime_config( + EmbeddingConfig(provider="ollama", model="nomic-embed-text"), + LlmConfig(provider="ollama", model="qwen2.5:1.5b"), + ) + PY + python -m moorcheh up \ + --bundled-ollama \ + --embedding-provider ollama \ + --embedding-model nomic-embed-text + + - name: Wait for services + env: + HOME: ${{ env.BENCH_HOME }} + run: | + for attempt in $(seq 1 60); do + if curl --fail --silent http://127.0.0.1:8080/health >/dev/null; then + break + fi + sleep 2 + done + curl --fail --silent http://127.0.0.1:8080/health >/dev/null + + # The HTTP health endpoint can become ready before namespace storage. 
+ for attempt in $(seq 1 30); do + if python -m moorcheh namespace-create \ + --name benchmark-readiness \ + --type text; then + exit 0 + fi + if python -m moorcheh namespace-list | grep -q benchmark-readiness; then + exit 0 + fi + sleep 2 + done + docker logs moorcheh-onprem-server + exit 1 + + - name: Run live benchmark + env: + HOME: ${{ env.BENCH_HOME }} + run: | + python examples/benchmarks/temporal-memory-showdown/run_benchmark.py \ + --backends memanto,mem0-direct,mem0-agentic \ + --repeats "${{ inputs.repeats }}" + + - name: Collect service logs + if: always() + run: | + mkdir -p examples/benchmarks/temporal-memory-showdown/results/logs + docker logs moorcheh-onprem-server \ + > examples/benchmarks/temporal-memory-showdown/results/logs/moorcheh.log 2>&1 || true + docker logs moorcheh-ollama \ + > examples/benchmarks/temporal-memory-showdown/results/logs/ollama.log 2>&1 || true + docker inspect moorcheh-onprem-server \ + > examples/benchmarks/temporal-memory-showdown/results/logs/server-inspect.json 2>&1 || true + find "$BENCH_HOME/.moorcheh" -maxdepth 3 -printf "%M %u:%g %s %p\n" \ + > examples/benchmarks/temporal-memory-showdown/results/logs/data-layout.txt 2>&1 || true + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: temporal-memory-showdown-results + path: examples/benchmarks/temporal-memory-showdown/results/ + if-no-files-found: warn + + - name: Stop services + if: always() + env: + HOME: ${{ env.BENCH_HOME }} + run: python -m moorcheh down --bundled-ollama || true diff --git a/examples/benchmarks/temporal-memory-showdown/.gitignore b/examples/benchmarks/temporal-memory-showdown/.gitignore new file mode 100644 index 00000000..1f3e17ca --- /dev/null +++ b/examples/benchmarks/temporal-memory-showdown/.gitignore @@ -0,0 +1,4 @@ +.benchmark-data/ +results/latest.json +results/latest.md +results/logs/ diff --git a/examples/benchmarks/temporal-memory-showdown/README.md 
b/examples/benchmarks/temporal-memory-showdown/README.md new file mode 100644 index 00000000..cee0f3e9 --- /dev/null +++ b/examples/benchmarks/temporal-memory-showdown/README.md @@ -0,0 +1,150 @@ +# Temporal Memory Showdown + +A live, reproducible benchmark of actual Memanto On-Prem and actual Mem0 OSS. +It tests a long-running research mission whose facts change across ten sessions. +The benchmark is designed to answer two separate questions: + +1. How do the retrieval layers compare when both store the same raw memories? +2. What changes when Mem0's normal LLM extraction and reconciliation pipeline is enabled? + +No backend is simulated. No LLM judges the results. + +## Systems under test + +| Backend | Storage path | Ingestion mode | +| --- | --- | --- | +| `memanto-on-prem` | Memanto `SdkClient` -> Moorcheh On-Prem | Typed `remember()` calls | +| `mem0-direct` | Mem0 2.0.5 -> local Qdrant | `infer=False`, raw memory | +| `mem0-agentic` | Mem0 2.0.5 -> local Qdrant | `infer=True`, Ollama extraction | + +All three use `nomic-embed-text` through the same Ollama service. The agentic +Mem0 run uses `qwen2.5:1.5b`; its native Ollama input and output token counters +are captured without estimating them. + +## Dataset + +The synthetic Asteria mission contains 32 records across ten sessions. It has +eleven explicit state changes, including: + +- crop: Genovese basil -> dwarf radish +- launch: August 14 -> September 2 +- commander: Elena Park -> Priya Nair +- channel: Slack -> Matrix +- nutrient protocol: N-17 / pH 6.2 -> N-21 / pH 5.9 +- landing site: Malapert Ridge -> Shackleton rim +- vendor: Helios / PO-81 -> Nova / PO-96 +- valve procedure: V1 -> V3 + +The 18 golden queries cover current state, history, and multi-hop briefs. +Every answer is scored with required and forbidden concept groups. This makes +the result deterministic and exposes stale-value leakage directly. 
+ +## Metrics + +- deterministic required-concept coverage +- exact query accuracy +- stale-value leak rate +- source and retrieved context tokens (`cl100k_base` accounting unit) +- native Ollama extraction tokens for agentic Mem0 +- ingestion total, p50, and p95 latency +- time until the final update becomes searchable +- query mean, p50, and p95 latency after warm-up +- client process RSS delta +- paired bootstrap 95% confidence interval for coverage differences + +## Reproduce + +Prerequisites: Python 3.10+, Docker with Compose, and enough disk space for +`nomic-embed-text` plus `qwen2.5:1.5b`. + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -e . +pip install -r examples/benchmarks/temporal-memory-showdown/requirements.txt +``` + +Configure Moorcheh to use the same local Ollama models: + +```bash +python - <<'PY' +from moorcheh.user_config import ( + EmbeddingConfig, + LlmConfig, + save_runtime_config, +) + +save_runtime_config( + EmbeddingConfig(provider="ollama", model="nomic-embed-text"), + LlmConfig(provider="ollama", model="qwen2.5:1.5b"), +) +PY + +python -m moorcheh up \ + --bundled-ollama \ + --embedding-provider ollama \ + --embedding-model nomic-embed-text +``` + +Run all systems: + +```bash +python examples/benchmarks/temporal-memory-showdown/run_benchmark.py \ + --backends memanto,mem0-direct,mem0-agentic \ + --repeats 5 +``` + +The runner writes machine-readable JSON and an audit-friendly Markdown table to +`results/latest.json` and `results/latest.md`. 
+ +Run only deterministic unit tests: + +```bash +pytest examples/benchmarks/temporal-memory-showdown/tests -q +``` + +## Verified live result + +The committed result was produced by +[GitHub Actions run 27441595257](https://github.com/2077196405-commits/memanto/actions/runs/27441595257) +on June 12, 2026, using a four-core Ubuntu runner: + +| Metric | Memanto On-Prem | Mem0 agentic | +| --- | ---: | ---: | +| Golden concept coverage | 97.2% | 69.4% | +| Total ingestion time | 0.096s | 2912.082s | +| Query p95 | 0.0983s | 0.1032s | +| Retrieved context tokens | 1779 | 1793 | +| Extraction LLM tokens | 0 | 134,690 | + +The paired coverage advantage is 27.8 percentage points, with a bootstrap 95% +confidence interval of 9.3 to 48.1 points. Memanto completed ingestion about +30,286 times faster while avoiding all extraction-model tokens. + +`mem0-direct` reached 98.6% coverage, but it deliberately disables Mem0's +normal extraction and reconciliation (`infer=False`). It is included as a +vector-only ablation, not the primary agentic competitor. + +The run also exposed a limitation worth keeping visible: raw top-five context +from every backend can contain superseded values. The report therefore +separates required-concept coverage from strict contradiction-free accuracy +instead of hiding stale-value leakage. + +## Experimental controls + +- same records, order, queries, and `top_k=5` +- same embedding model and Ollama service +- fresh Memanto agent and fresh Mem0 collection per run +- one warm-up query pass before measured latency samples +- first measured pass used for accuracy and context-token totals +- no answer-generation model and no LLM-as-a-judge +- fixed bootstrap seed and fixed tokenizer accounting unit + +## Interpretation limits + +The benchmark measures retrieval context, not final answer quality. 
Memanto's +server runs in Docker while Mem0's Qdrant runs in the Python process, so client +RSS is reported but is not treated as a total-system memory comparison. +`cl100k_base` is a stable cross-system accounting unit, not the native embedding +tokenizer. Exact internal LLM tokens are reported only where Ollama exposes +them. diff --git a/examples/benchmarks/temporal-memory-showdown/backends.py b/examples/benchmarks/temporal-memory-showdown/backends.py new file mode 100644 index 00000000..595f17a3 --- /dev/null +++ b/examples/benchmarks/temporal-memory-showdown/backends.py @@ -0,0 +1,302 @@ +"""Live adapters for Memanto On-Prem and Mem0 OSS.""" + +from __future__ import annotations + +import os +import shutil +import time +import uuid +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Protocol + +from dataset import MemoryRecord + + +@dataclass(frozen=True) +class SearchHit: + text: str + score: float | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + +class MemoryBackend(Protocol): + name: str + + def ingest(self, record: MemoryRecord) -> None: ... + + def search(self, query: str, top_k: int) -> list[SearchHit]: ... + + def usage(self) -> dict[str, int]: ... + + def close(self) -> None: ... 
class MemantoBackend:
    """Benchmark adapter that stores and recalls memories through Memanto On-Prem."""

    name = "memanto-on-prem"

    def __init__(self, base_url: str, run_id: str) -> None:
        """Point Memanto at the on-prem server and bootstrap a fresh agent.

        Args:
            base_url: URL assigned to ``settings.MOORCHEH_ONPREM_URL``.
            run_id: Unique run token embedded in the agent id so every run
                gets its own agent.
        """
        # Imported lazily so this module can be loaded without Memanto installed.
        from memanto.app.clients.moorcheh import moorcheh_client
        from memanto.app.config import settings
        from memanto.cli.client.sdk_client import SdkClient

        settings.MEMANTO_BACKEND = "on-prem"
        settings.MOORCHEH_ONPREM_URL = base_url
        # Reset after the settings change so the next client use sees it.
        moorcheh_client.reset_client()

        self.agent_id = f"bench-{run_id}-memanto"
        self.client = SdkClient(api_key="on-prem-local")
        create_memanto_agent(
            self.client,
            agent_id=self.agent_id,
            pattern="tool",
            description="Temporal memory benchmark",
        )
        self.client.activate_agent(self.agent_id, duration_hours=2)

    def ingest(self, record: MemoryRecord) -> None:
        """Store one record as a typed memory, prefixed with its record id."""
        self.client.remember(
            agent_id=self.agent_id,
            memory_type=record.memory_type,
            title=record.record_id,
            content=f"[{record.record_id}] {record.text}",
            confidence=1.0,
            tags=[f"session-{record.session:02d}", record.record_id],
            source="benchmark",
            provenance="explicit_statement",
        )

    def search(self, query: str, top_k: int) -> list[SearchHit]:
        """Recall up to ``top_k`` memories for ``query`` with no similarity floor."""
        response = self.client.recall(
            agent_id=self.agent_id,
            query=query,
            limit=top_k,
            min_similarity=0.0,
        )
        hits = []
        for item in response.get("memories", []):
            # Fall back from "content" to "text"; an absent body becomes "".
            text = item.get("content") or item.get("text") or ""
            hits.append(
                SearchHit(
                    text=text,
                    score=item.get("score"),
                    metadata={
                        "id": item.get("id"),
                        "type": item.get("type"),
                        "tags": item.get("tags", []),
                    },
                )
            )
        return hits

    def usage(self) -> dict[str, int]:
        """Return all-zero LLM counters; this adapter meters no LLM calls."""
        return {"llm_calls": 0, "llm_input_tokens": 0, "llm_output_tokens": 0}

    def close(self) -> None:
        """Deactivate the benchmark agent, ignoring any failure."""
        try:
            self.client.deactivate_agent(self.agent_id)
        except Exception:
            # Deliberate best-effort teardown: a failed deactivation must not
            # mask the benchmark result or an earlier error.
            pass


def create_memanto_agent(
    client: Any,
    *,
    agent_id: str,
    pattern: str,
    description: str,
    attempts: int = 5,
    delay_s: float = 1.0,
) -> None:
    """Retry idempotent bootstrap when On-Prem commits before returning 500.

    Args:
        client: Object exposing ``create_agent(agent_id=, pattern=, description=)``.
        agent_id: Identifier of the agent to create.
        pattern: Agent pattern forwarded to ``create_agent``.
        description: Description forwarded to ``create_agent``.
        attempts: Maximum number of ``create_agent`` calls; must be >= 1.
        delay_s: Seconds to sleep between failed attempts.

    Raises:
        ValueError: If ``attempts`` is smaller than 1.
        Exception: Whatever the final failed ``create_agent`` call raised.
    """

    if attempts < 1:
        raise ValueError("attempts must be at least 1")
    for attempt in range(1, attempts + 1):
        try:
            client.create_agent(
                agent_id=agent_id,
                pattern=pattern,
                description=description,
            )
        except Exception:
            # Re-raise the final failure directly rather than through an
            # ``assert``, which would be stripped under ``python -O``.
            if attempt == attempts:
                raise
            time.sleep(delay_s)
        else:
            return


class MeteredOllamaClient:
    """Proxy an Ollama client while retaining native token counters."""

    def __init__(self, wrapped: Any) -> None:
        self.wrapped = wrapped
        self.calls = 0
        self.input_tokens = 0
        self.output_tokens = 0

    def __getattr__(self, name: str) -> Any:
        """Delegate attributes not defined on the proxy to the wrapped client."""
        # ``__getattr__`` only runs when normal lookup fails. If ``wrapped``
        # itself is missing (instance created without ``__init__``, e.g. while
        # being copied), delegating would recurse forever, so fail cleanly.
        if name == "wrapped":
            raise AttributeError(name)
        return getattr(self.wrapped, name)

    def chat(self, *args: Any, **kwargs: Any) -> Any:
        """Forward to ``wrapped.chat`` and accumulate its token counters."""
        response = self.wrapped.chat(*args, **kwargs)
        self.calls += 1
        self.input_tokens += _response_int(response, "prompt_eval_count")
        self.output_tokens += _response_int(response, "eval_count")
        return response


def _response_int(response: Any, key: str) -> int:
    """Read ``key`` from a dict or attribute-style response as an int.

    A missing, ``None`` or otherwise falsy value counts as 0.
    """
    if isinstance(response, dict):
        value = response.get(key, 0)
    else:
        value = getattr(response, key, 0)
    return int(value or 0)


class Mem0Backend:
    """Benchmark adapter around a mem0 ``Memory`` built from a local config.

    With ``infer=True`` the adapter is named ``mem0-agentic`` and its LLM
    client is wrapped in a ``MeteredOllamaClient``; otherwise it is named
    ``mem0-direct`` and no meter is installed.
    """

    def __init__(
        self,
        *,
        ollama_url: str,
        llm_model: str,
        run_id: str,
        infer: bool,
        work_dir: Path,
    ) -> None:
        """Create a fresh working directory and a configured ``Memory``.

        Args:
            ollama_url: Base URL of the Ollama service, forwarded to the config.
            llm_model: Model name forwarded to the config.
            run_id: Unique run token embedded in the user id and the config.
            infer: Whether ingestion should use mem0's inference path.
            work_dir: Parent directory; a per-backend subdirectory is wiped
                and recreated beneath it.
        """
        # Opt out of telemetry *before* importing mem0. The original set this
        # after the import, which is too late if the library reads the flag
        # while its modules load (NOTE(review): mem0 appears to do so; setting
        # it early is harmless either way). ``setdefault`` keeps an explicit
        # caller-provided value.
        os.environ.setdefault("MEM0_TELEMETRY", "false")
        from mem0 import Memory

        self.infer = infer
        self.name = "mem0-agentic" if infer else "mem0-direct"
        self.user_id = f"bench-{run_id}-{self.name}"
        self.work_dir = work_dir / self.name
        # Start from an empty directory so earlier runs cannot leak state.
        if self.work_dir.exists():
            shutil.rmtree(self.work_dir)
        self.work_dir.mkdir(parents=True)

        config = build_mem0_config(
            ollama_url=ollama_url,
            llm_model=llm_model,
            run_id=run_id,
            backend_name=self.name,
            work_dir=self.work_dir,
        )
        self.memory = Memory.from_config(config)
        self.meter: MeteredOllamaClient | None = None
        if infer:
            # Swap in the metering proxy so extraction calls are counted.
            self.meter = MeteredOllamaClient(self.memory.llm.client)
            self.memory.llm.client = self.meter
+ def ingest(self, record: MemoryRecord) -> None: + self.memory.add( + [{"role": "user", "content": f"[{record.record_id}] {record.text}"}], + user_id=self.user_id, + metadata={ + "record_id": record.record_id, + "session": record.session, + "memory_type": record.memory_type, + }, + infer=self.infer, + ) + + def search(self, query: str, top_k: int) -> list[SearchHit]: + response = self.memory.search( + query, + top_k=top_k, + filters={"user_id": self.user_id}, + threshold=0.0, + ) + hits = [] + for item in response.get("results", []): + hits.append( + SearchHit( + text=item.get("memory") or item.get("text") or "", + score=item.get("score"), + metadata=item.get("metadata") or {}, + ) + ) + return hits + + def usage(self) -> dict[str, int]: + if self.meter is None: + return {"llm_calls": 0, "llm_input_tokens": 0, "llm_output_tokens": 0} + return { + "llm_calls": self.meter.calls, + "llm_input_tokens": self.meter.input_tokens, + "llm_output_tokens": self.meter.output_tokens, + } + + def close(self) -> None: + close = getattr(self.memory.vector_store.client, "close", None) + if callable(close): + close() + + +def build_mem0_config( + *, + ollama_url: str, + llm_model: str, + run_id: str, + backend_name: str, + work_dir: Path, +) -> dict[str, Any]: + return { + "version": "v1.1", + "llm": { + "provider": "ollama", + "config": { + "model": llm_model, + "ollama_base_url": ollama_url, + "temperature": 0.0, + "max_tokens": 1200, + "top_p": 0.1, + }, + }, + "embedder": { + "provider": "ollama", + "config": { + "model": "nomic-embed-text", + "ollama_base_url": ollama_url, + "embedding_dims": 768, + }, + }, + "vector_store": { + "provider": "qdrant", + "config": { + "collection_name": f"temporal_{run_id}_{backend_name}", + "path": str(work_dir / "qdrant"), + "on_disk": True, + "embedding_model_dims": 768, + }, + }, + "history_db_path": str(work_dir / "history.db"), + } + + +def wait_until_searchable( + backend: MemoryBackend, + *, + query: str, + expected: str, + top_k: int, + 
timeout_s: float, +) -> float: + started = time.perf_counter() + deadline = started + timeout_s + expected_lower = expected.casefold() + while time.perf_counter() < deadline: + hits = backend.search(query, top_k) + if expected_lower in "\n".join(hit.text for hit in hits).casefold(): + return time.perf_counter() - started + time.sleep(1.0) + raise TimeoutError( + f"{backend.name} did not surface {expected!r} within {timeout_s:.0f}s" + ) + + +def new_run_id() -> str: + return uuid.uuid4().hex[:10] diff --git a/examples/benchmarks/temporal-memory-showdown/dataset.py b/examples/benchmarks/temporal-memory-showdown/dataset.py new file mode 100644 index 00000000..458833bd --- /dev/null +++ b/examples/benchmarks/temporal-memory-showdown/dataset.py @@ -0,0 +1,364 @@ +"""Synthetic long-horizon dataset with explicit temporal contradictions.""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class MemoryRecord: + record_id: str + session: int + memory_type: str + text: str + + +@dataclass(frozen=True) +class QueryCase: + query_id: str + category: str + query: str + required: tuple[tuple[str, ...], ...] + forbidden: tuple[tuple[str, ...], ...] = () + + +RECORDS: tuple[MemoryRecord, ...] = ( + MemoryRecord( + "M001", + 1, + "fact", + "Asteria's greenhouse module is GH-7. The initial crop is Genovese basil.", + ), + MemoryRecord( + "M002", + 1, + "event", + "The original launch window is August 14, 2026, from Kiruna.", + ), + MemoryRecord( + "M003", + 1, + "relationship", + "Elena Park is mission commander. Dr. Amara Okafor owns plant science.", + ), + MemoryRecord( + "M004", + 1, + "instruction", + "Operational alerts go to Slack channel #asteria-ops.", + ), + MemoryRecord( + "M005", + 2, + "decision", + "The baseline nutrient recipe is N-17 with a target root-zone pH of 6.2.", + ), + MemoryRecord( + "M006", + 2, + "fact", + "The greenhouse thermal ceiling is 24 C. 
Backup oxygen lasts 18 hours.", + ), + MemoryRecord( + "M007", + 2, + "decision", + "Helios Agritech is the approved seed vendor under purchase order PO-81.", + ), + MemoryRecord( + "M008", + 3, + "instruction", + "Emergency valve recovery uses procedure V1: isolate line B, then cycle pump 2.", + ), + MemoryRecord( + "M009", + 3, + "fact", + "The original landing site is Malapert Ridge. The mission call sign is Aurora.", + ), + MemoryRecord( + "M010", + 3, + "decision", + "Telemetry is retained for 90 days. The approved program cap is USD 420,000.", + ), + MemoryRecord( + "M011", + 4, + "decision", + "Crop plan revision C-2 replaces basil with dwarf radish for the flight trial.", + ), + MemoryRecord( + "M012", + 4, + "observation", + "Dwarf radish completed germination testing in GH-7 with no mold detected.", + ), + MemoryRecord( + "M013", + 4, + "instruction", + "Do not load Genovese basil seed into the flight cassette after revision C-2.", + ), + MemoryRecord( + "M014", + 5, + "decision", + "Launch review L-3 moves the launch from August 14 to September 2, 2026.", + ), + MemoryRecord( + "M015", + 5, + "relationship", + "Priya Nair replaces Elena Park as mission commander. Elena remains an adviser.", + ), + MemoryRecord( + "M016", + 5, + "instruction", + "The current operations channel is Matrix room #asteria-flight, not Slack.", + ), + MemoryRecord( + "M017", + 6, + "decision", + "Nutrient protocol N-21 supersedes N-17 and lowers target root-zone pH to 5.9.", + ), + MemoryRecord( + "M018", + 6, + "instruction", + "The 24 C thermal ceiling is revoked. 
Hold GH-7 at or below 22 C.", + ), + MemoryRecord( + "M019", + 6, + "fact", + "Battery pack BX-9 supplies the greenhouse controller during transfer.", + ), + MemoryRecord( + "M020", + 7, + "decision", + "Site review S-4 selects Shackleton rim and rejects Malapert Ridge.", + ), + MemoryRecord( + "M021", + 7, + "decision", + "The finance board reduces the program cap from USD 420,000 to USD 390,000.", + ), + MemoryRecord( + "M022", + 7, + "event", + "Helios Agritech lot H-44 is recalled after contamination screening.", + ), + MemoryRecord( + "M023", + 8, + "decision", + "Nova Seedworks becomes the approved vendor under replacement order PO-96.", + ), + MemoryRecord( + "M024", + 8, + "instruction", + "Procedure V3 supersedes V1: isolate line C, vent for 30 seconds, then cycle pump 1.", + ), + MemoryRecord( + "M025", + 8, + "instruction", + "Never use valve procedure V1 during flight operations.", + ), + MemoryRecord( + "M026", + 9, + "decision", + "Security review shortens telemetry retention from 90 days to 30 days.", + ), + MemoryRecord( + "M027", + 9, + "decision", + "The mission call sign changes from Aurora to Lumen for all current traffic.", + ), + MemoryRecord( + "M028", + 9, + "commitment", + "Dr. Okafor will deliver the final radish growth report before August 20.", + ), + MemoryRecord( + "M029", + 10, + "fact", + "Final readiness confirms GH-7, dwarf radish, N-21, and the 22 C ceiling.", + ), + MemoryRecord( + "M030", + 10, + "fact", + "Final command brief lists Priya Nair, Shackleton rim, and call sign Lumen.", + ), + MemoryRecord( + "M031", + 10, + "instruction", + "Send current alerts only to Matrix room #asteria-flight.", + ), + MemoryRecord( + "M032", + 10, + "fact", + "Final logistics lists Nova Seedworks PO-96 and battery pack BX-9.", + ), +) + + +QUERIES: tuple[QueryCase, ...] 
= ( + QueryCase( + "Q01", + "current-state", + "What crop is currently approved for the Asteria flight trial?", + (("dwarf radish", "radish"),), + (("genovese basil", "basil"),), + ), + QueryCase( + "Q02", + "current-state", + "What is the current launch date?", + (("september 2", "september 2, 2026"),), + (("august 14",),), + ), + QueryCase( + "Q03", + "current-state", + "Who is the current mission commander?", + (("priya nair",),), + (("elena park",),), + ), + QueryCase( + "Q04", + "current-state", + "Where should current operational alerts be sent?", + (("matrix",), ("#asteria-flight", "asteria-flight")), + (("slack", "#asteria-ops", "asteria-ops"),), + ), + QueryCase( + "Q05", + "current-state", + "Which nutrient protocol and pH target are active?", + (("n-21",), ("5.9",)), + (("n-17",), ("6.2",)), + ), + QueryCase( + "Q06", + "current-state", + "What is the active greenhouse thermal ceiling?", + (("22 c", "22c", "22 degrees"),), + (("24 c", "24c", "24 degrees"),), + ), + QueryCase( + "Q07", + "current-state", + "Which landing site is selected now?", + (("shackleton rim", "shackleton"),), + (("malapert ridge", "malapert"),), + ), + QueryCase( + "Q08", + "current-state", + "What is the current approved program cap?", + (("390,000", "390000", "390k"),), + (("420,000", "420000", "420k"),), + ), + QueryCase( + "Q09", + "current-state", + "Which seed vendor and purchase order are currently approved?", + (("nova seedworks",), ("po-96",)), + (("helios agritech", "po-81"),), + ), + QueryCase( + "Q10", + "current-state", + "Which emergency valve procedure is active?", + (("v3",), ("line c",), ("pump 1",)), + (("v1", "line b", "pump 2"),), + ), + QueryCase( + "Q11", + "current-state", + "What are the current telemetry retention period and mission call sign?", + (("30 days",), ("lumen",)), + (("90 days",), ("aurora",)), + ), + QueryCase( + "Q12", + "historical", + "What crop was approved before revision C-2?", + (("genovese basil", "basil"),), + (("dwarf radish",),), + ), 
+ QueryCase( + "Q13", + "historical", + "Who was mission commander before Priya Nair?", + (("elena park",),), + (("priya nair",),), + ), + QueryCase( + "Q14", + "historical", + "What was the original landing site?", + (("malapert ridge", "malapert"),), + (("shackleton rim",),), + ), + QueryCase( + "Q15", + "historical", + "Which valve procedure was used before V3?", + (("v1",), ("line b",), ("pump 2",)), + (("line c", "pump 1"),), + ), + QueryCase( + "Q16", + "multi-hop", + "Prepare the current command brief: commander, landing site, and call sign.", + (("priya nair",), ("shackleton rim", "shackleton"), ("lumen",)), + (("elena park",), ("malapert ridge",), ("aurora",)), + ), + QueryCase( + "Q17", + "multi-hop", + "Prepare the current greenhouse brief: module, crop, nutrient protocol, and temperature limit.", + (("gh-7",), ("dwarf radish", "radish"), ("n-21",), ("22 c", "22c")), + (("basil",), ("n-17",), ("24 c", "24c")), + ), + QueryCase( + "Q18", + "multi-hop", + "Prepare current logistics: vendor, order, and transfer battery.", + (("nova seedworks",), ("po-96",), ("bx-9",)), + (("helios agritech", "po-81"),), + ), +) + + +def validate_dataset() -> None: + record_ids = [record.record_id for record in RECORDS] + query_ids = [query.query_id for query in QUERIES] + if len(record_ids) != len(set(record_ids)): + raise ValueError("Duplicate memory record id") + if len(query_ids) != len(set(query_ids)): + raise ValueError("Duplicate query id") + if any(record.session < 1 for record in RECORDS): + raise ValueError("Sessions must be positive") + if any(not query.required for query in QUERIES): + raise ValueError("Every query needs at least one required concept") + + +validate_dataset() diff --git a/examples/benchmarks/temporal-memory-showdown/metrics.py b/examples/benchmarks/temporal-memory-showdown/metrics.py new file mode 100644 index 00000000..5591f40b --- /dev/null +++ b/examples/benchmarks/temporal-memory-showdown/metrics.py @@ -0,0 +1,134 @@ +"""Deterministic scoring 
and statistics for the memory benchmark.""" + +from __future__ import annotations + +import math +import random +import re +from collections.abc import Iterable, Sequence +from dataclasses import asdict, dataclass +from statistics import mean + +import tiktoken +from dataset import QueryCase + +_ENCODING = tiktoken.get_encoding("cl100k_base") + + +@dataclass(frozen=True) +class QueryScore: + query_id: str + category: str + coverage: float + stale_leak: bool + exact: bool + matched_required: int + required_total: int + matched_forbidden: int + + def to_dict(self) -> dict: + return asdict(self) + + +def count_tokens(text: str) -> int: + return len(_ENCODING.encode(text)) + + +def percentile(values: Sequence[float], quantile: float) -> float: + if not values: + return 0.0 + if not 0.0 <= quantile <= 1.0: + raise ValueError("quantile must be between 0 and 1") + ordered = sorted(values) + rank = max(1, math.ceil(quantile * len(ordered))) + return ordered[min(rank - 1, len(ordered) - 1)] + + +def normalize(text: str) -> str: + lowered = text.casefold() + return re.sub(r"\s+", " ", lowered).strip() + + +def _group_matches(text: str, aliases: Iterable[str]) -> bool: + return any(normalize(alias) in text for alias in aliases) + + +def score_query(case: QueryCase, retrieved_text: str) -> QueryScore: + normalized = normalize(retrieved_text) + required_hits = sum( + _group_matches(normalized, aliases) for aliases in case.required + ) + forbidden_hits = sum( + _group_matches(normalized, aliases) for aliases in case.forbidden + ) + coverage = required_hits / len(case.required) + stale_leak = forbidden_hits > 0 + return QueryScore( + query_id=case.query_id, + category=case.category, + coverage=round(coverage, 6), + stale_leak=stale_leak, + exact=coverage == 1.0 and not stale_leak, + matched_required=required_hits, + required_total=len(case.required), + matched_forbidden=forbidden_hits, + ) + + +def summarize_scores(scores: Sequence[QueryScore]) -> dict: + if not scores: + return 
{ + "mean_coverage": 0.0, + "exact_accuracy": 0.0, + "stale_leak_rate": 0.0, + "by_category": {}, + } + + by_category: dict[str, list[QueryScore]] = {} + for score in scores: + by_category.setdefault(score.category, []).append(score) + + return { + "mean_coverage": round(mean(s.coverage for s in scores), 6), + "exact_accuracy": round(mean(float(s.exact) for s in scores), 6), + "stale_leak_rate": round(mean(float(s.stale_leak) for s in scores), 6), + "by_category": { + category: { + "mean_coverage": round(mean(s.coverage for s in rows), 6), + "exact_accuracy": round(mean(float(s.exact) for s in rows), 6), + "stale_leak_rate": round(mean(float(s.stale_leak) for s in rows), 6), + "queries": len(rows), + } + for category, rows in sorted(by_category.items()) + }, + } + + +def paired_bootstrap_delta( + baseline: Sequence[QueryScore], + challenger: Sequence[QueryScore], + *, + samples: int = 5000, + seed: int = 639, +) -> dict: + if len(baseline) != len(challenger) or not baseline: + raise ValueError("Paired bootstrap requires equal non-empty score lists") + rng = random.Random(seed) + deltas: list[float] = [] + count = len(baseline) + for _ in range(samples): + indices = [rng.randrange(count) for _ in range(count)] + baseline_mean = mean(baseline[i].coverage for i in indices) + challenger_mean = mean(challenger[i].coverage for i in indices) + deltas.append(challenger_mean - baseline_mean) + + ordered = sorted(deltas) + lower = ordered[max(0, math.floor(0.025 * samples))] + upper = ordered[min(samples - 1, math.ceil(0.975 * samples) - 1)] + observed = mean(s.coverage for s in challenger) - mean(s.coverage for s in baseline) + return { + "observed_delta": round(observed, 6), + "ci95": [round(lower, 6), round(upper, 6)], + "samples": samples, + "seed": seed, + } diff --git a/examples/benchmarks/temporal-memory-showdown/requirements.txt b/examples/benchmarks/temporal-memory-showdown/requirements.txt new file mode 100644 index 00000000..f812ed71 --- /dev/null +++ 
b/examples/benchmarks/temporal-memory-showdown/requirements.txt @@ -0,0 +1,8 @@ +mem0ai==2.0.5 +moorcheh-client==0.1.3 +ollama==0.6.1 +psutil>=6,<8 +pytest>=8,<9 +pytest-asyncio>=0.21 +pytest-timeout>=2,<3 +tiktoken==0.12.0 diff --git a/examples/benchmarks/temporal-memory-showdown/results/.gitkeep b/examples/benchmarks/temporal-memory-showdown/results/.gitkeep new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/examples/benchmarks/temporal-memory-showdown/results/.gitkeep @@ -0,0 +1 @@ + diff --git a/examples/benchmarks/temporal-memory-showdown/results/latest.json b/examples/benchmarks/temporal-memory-showdown/results/latest.json new file mode 100644 index 00000000..fa32b33d --- /dev/null +++ b/examples/benchmarks/temporal-memory-showdown/results/latest.json @@ -0,0 +1,4430 @@ +{ + "schema_version": 1, + "run_id": "7c776889d4", + "config": { + "backends": [ + "memanto", + "mem0-direct", + "mem0-agentic" + ], + "top_k": 5, + "repeats": 5, + "llm_model": "qwen2.5:1.5b", + "moorcheh_url": "http://127.0.0.1:8080", + "ollama_url": "http://127.0.0.1:11434" + }, + "dataset": { + "records": 32, + "queries": 18, + "sessions": 10 + }, + "environment": { + "timestamp_utc": "2026-06-12T21:27:29.456574+00:00", + "python": "3.12.13 (main, Mar 4 2026, 02:26:36) [GCC 13.3.0]", + "platform": "Linux-6.17.0-1018-azure-x86_64-with-glibc2.39", + "machine": "x86_64", + "processor": "x86_64", + "logical_cpus": 4, + "memory_gb": 15.615, + "git_commit": "66219b45b2f578cb9dea029ee6757316f3abb3ea", + "docker": "28.0.4" + }, + "runs": [ + { + "backend": "memanto-on-prem", + "summary": { + "mean_coverage": 0.972222, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "by_category": { + "current-state": { + "mean_coverage": 1.0, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "queries": 11 + }, + "historical": { + "mean_coverage": 1.0, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "queries": 4 + }, + "multi-hop": { + "mean_coverage": 0.833333, + "exact_accuracy": 0.0, 
+ "stale_leak_rate": 1.0, + "queries": 3 + } + } + }, + "metrics": { + "records_ingested": 32, + "queries_evaluated": 18, + "top_k": 5, + "latency_repeats": 5, + "source_tokens": 551, + "retrieved_tokens": 1779, + "avg_retrieved_tokens_per_query": 98.833, + "ingest_total_s": 0.096153, + "ingest_p50_s": 0.002077, + "ingest_p95_s": 0.004931, + "index_ready_s": 4.823175, + "query_mean_s": 0.07348, + "query_p50_s": 0.069954, + "query_p95_s": 0.098346, + "client_rss_delta_mb": 5.676, + "llm_calls": 0, + "llm_input_tokens": 0, + "llm_output_tokens": 0 + }, + "scores": [ + { + "query_id": "Q01", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q02", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q03", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q04", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + { + "query_id": "Q05", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + { + "query_id": "Q06", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q07", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q08", + "category": "current-state", + "coverage": 1.0, + 
"stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q09", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + { + "query_id": "Q10", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + { + "query_id": "Q11", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + { + "query_id": "Q12", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q13", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q14", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q15", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + { + "query_id": "Q16", + "category": "multi-hop", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 2 + }, + { + "query_id": "Q17", + "category": "multi-hop", + "coverage": 0.5, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 4, + "matched_forbidden": 3 + }, + { + "query_id": "Q18", + "category": "multi-hop", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + 
"matched_forbidden": 1 + } + ], + "queries": [ + { + "query_id": "Q01", + "category": "current-state", + "query": "What crop is currently approved for the Asteria flight trial?", + "retrieved_tokens": 103, + "score": { + "query_id": "Q01", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M011] Crop plan revision C-2 replaces basil with dwarf radish for the flight trial.", + "score": 0.439176, + "metadata": { + "id": "c6df5d1b-4dce-4a18-9cb5-0a872945b42f", + "type": "decision", + "tags": [ + "session-04", + "M011" + ] + } + }, + { + "text": "[M001] Asteria's greenhouse module is GH-7. The initial crop is Genovese basil.", + "score": 0.415482, + "metadata": { + "id": "f9b0346a-18f2-47c7-b57d-3080dd87109f", + "type": "fact", + "tags": [ + "session-01", + "M001" + ] + } + }, + { + "text": "[M007] Helios Agritech is the approved seed vendor under purchase order PO-81.", + "score": 0.396664, + "metadata": { + "id": "ffea9941-7443-4017-9061-f4fc4fc75d16", + "type": "decision", + "tags": [ + "session-02", + "M007" + ] + } + }, + { + "text": "[M012] Dwarf radish completed germination testing in GH-7 with no mold detected.", + "score": 0.373297, + "metadata": { + "id": "bf95a60b-03fd-4877-ab21-d46bc4b3a887", + "type": "observation", + "tags": [ + "session-04", + "M012" + ] + } + }, + { + "text": "[M005] The baseline nutrient recipe is N-17 with a target root-zone pH of 6.2.", + "score": 0.368643, + "metadata": { + "id": "d7cb6dab-e618-4fd5-9597-72bb6dc0d11b", + "type": "decision", + "tags": [ + "session-02", + "M005" + ] + } + } + ] + }, + { + "query_id": "Q02", + "category": "current-state", + "query": "What is the current launch date?", + "retrieved_tokens": 110, + "score": { + "query_id": "Q02", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 
1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M002] The original launch window is August 14, 2026, from Kiruna.", + "score": 0.415482, + "metadata": { + "id": "05efafef-ca00-4027-85ab-a919e6dcaace", + "type": "event", + "tags": [ + "session-01", + "M002" + ] + } + }, + { + "text": "[M014] Launch review L-3 moves the launch from August 14 to September 2, 2026.", + "score": 0.387297, + "metadata": { + "id": "62ddb53b-33d7-49b0-8e58-24f400df109d", + "type": "decision", + "tags": [ + "session-05", + "M014" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.340848, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + }, + { + "text": "[M029] Final readiness confirms GH-7, dwarf radish, N-21, and the 22 C ceiling.", + "score": 0.31784, + "metadata": { + "id": "1baab304-e3f1-4d74-902f-26fcf76b11d2", + "type": "fact", + "tags": [ + "session-10", + "M029" + ] + } + }, + { + "text": "[M018] The 24 C thermal ceiling is revoked. Hold GH-7 at or below 22 C.", + "score": 0.304097, + "metadata": { + "id": "fc6d7079-d1e0-4e50-80bc-eac817ab6a16", + "type": "instruction", + "tags": [ + "session-06", + "M018" + ] + } + } + ] + }, + { + "query_id": "Q03", + "category": "current-state", + "query": "Who is the current mission commander?", + "retrieved_tokens": 90, + "score": { + "query_id": "Q03", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M015] Priya Nair replaces Elena Park as mission commander. 
Elena remains an adviser.", + "score": 0.373297, + "metadata": { + "id": "368569aa-1c3b-499b-9054-c5588c0798e3", + "type": "relationship", + "tags": [ + "session-05", + "M015" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.359354, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + }, + { + "text": "[M016] The current operations channel is Matrix room #asteria-flight, not Slack.", + "score": 0.354719, + "metadata": { + "id": "64cb1f08-0d7d-4335-b085-346d8dfe0f94", + "type": "instruction", + "tags": [ + "session-05", + "M016" + ] + } + }, + { + "text": "[M003] Elena Park is mission commander. Dr. Amara Okafor owns plant science.", + "score": 0.35009, + "metadata": { + "id": "036341e9-a156-4bb6-88ee-5542b7bbb633", + "type": "relationship", + "tags": [ + "session-01", + "M003" + ] + } + }, + { + "text": "[M031] Send current alerts only to Matrix room #asteria-flight.", + "score": 0.340848, + "metadata": { + "id": "36840e1c-bed7-4b97-a03a-71b12684667a", + "type": "instruction", + "tags": [ + "session-10", + "M031" + ] + } + } + ] + }, + { + "query_id": "Q04", + "category": "current-state", + "query": "Where should current operational alerts be sent?", + "retrieved_tokens": 85, + "score": { + "query_id": "Q04", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.458279, + "metadata": { + "id": "d74b1abe-a4c1-4e2a-b69b-dea5a73af200", + "type": "instruction", + "tags": [ + "session-01", + "M004" + ] + } + }, + { + "text": "[M031] Send current alerts only to Matrix room #asteria-flight.", + "score": 0.396664, + "metadata": { + "id": "36840e1c-bed7-4b97-a03a-71b12684667a", + "type": "instruction", + 
"tags": [ + "session-10", + "M031" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.363996, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + }, + { + "text": "[M016] The current operations channel is Matrix room #asteria-flight, not Slack.", + "score": 0.331629, + "metadata": { + "id": "64cb1f08-0d7d-4335-b085-346d8dfe0f94", + "type": "instruction", + "tags": [ + "session-05", + "M016" + ] + } + }, + { + "text": "[M032] Final logistics lists Nova Seedworks PO-96 and battery pack BX-9.", + "score": 0.322431, + "metadata": { + "id": "7262fe31-c211-4ad3-836b-eeb7347e8a3a", + "type": "fact", + "tags": [ + "session-10", + "M032" + ] + } + } + ] + }, + { + "query_id": "Q05", + "category": "current-state", + "query": "Which nutrient protocol and pH target are active?", + "retrieved_tokens": 102, + "score": { + "query_id": "Q05", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "[M017] Nutrient protocol N-21 supersedes N-17 and lowers target root-zone pH to 5.9.", + "score": 0.420205, + "metadata": { + "id": "3701a902-7157-41ec-be78-4f9ca96c72e9", + "type": "decision", + "tags": [ + "session-06", + "M017" + ] + } + }, + { + "text": "[M005] The baseline nutrient recipe is N-17 with a target root-zone pH of 6.2.", + "score": 0.420205, + "metadata": { + "id": "d7cb6dab-e618-4fd5-9597-72bb6dc0d11b", + "type": "decision", + "tags": [ + "session-02", + "M005" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.336236, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + }, + { + "text": "[M004] Operational alerts go to Slack 
channel #asteria-ops.", + "score": 0.322431, + "metadata": { + "id": "d74b1abe-a4c1-4e2a-b69b-dea5a73af200", + "type": "instruction", + "tags": [ + "session-01", + "M004" + ] + } + }, + { + "text": "[M032] Final logistics lists Nova Seedworks PO-96 and battery pack BX-9.", + "score": 0.31784, + "metadata": { + "id": "7262fe31-c211-4ad3-836b-eeb7347e8a3a", + "type": "fact", + "tags": [ + "session-10", + "M032" + ] + } + } + ] + }, + { + "query_id": "Q06", + "category": "current-state", + "query": "What is the active greenhouse thermal ceiling?", + "retrieved_tokens": 97, + "score": { + "query_id": "Q06", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M006] The greenhouse thermal ceiling is 24 C. Backup oxygen lasts 18 hours.", + "score": 0.458279, + "metadata": { + "id": "e1f6be61-65eb-4e9f-9b29-191aa02aac21", + "type": "fact", + "tags": [ + "session-02", + "M006" + ] + } + }, + { + "text": "[M018] The 24 C thermal ceiling is revoked. Hold GH-7 at or below 22 C.", + "score": 0.429675, + "metadata": { + "id": "fc6d7079-d1e0-4e50-80bc-eac817ab6a16", + "type": "instruction", + "tags": [ + "session-06", + "M018" + ] + } + }, + { + "text": "[M001] Asteria's greenhouse module is GH-7. 
The initial crop is Genovese basil.", + "score": 0.345466, + "metadata": { + "id": "f9b0346a-18f2-47c7-b57d-3080dd87109f", + "type": "fact", + "tags": [ + "session-01", + "M001" + ] + } + }, + { + "text": "[M019] Battery pack BX-9 supplies the greenhouse controller during transfer.", + "score": 0.336236, + "metadata": { + "id": "b2973696-700b-48f8-b4e8-4e3c99b09966", + "type": "fact", + "tags": [ + "session-06", + "M019" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.322431, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + } + ] + }, + { + "query_id": "Q07", + "category": "current-state", + "query": "Which landing site is selected now?", + "retrieved_tokens": 101, + "score": { + "query_id": "Q07", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M014] Launch review L-3 moves the launch from August 14 to September 2, 2026.", + "score": 0.363996, + "metadata": { + "id": "62ddb53b-33d7-49b0-8e58-24f400df109d", + "type": "decision", + "tags": [ + "session-05", + "M014" + ] + } + }, + { + "text": "[M020] Site review S-4 selects Shackleton rim and rejects Malapert Ridge.", + "score": 0.345466, + "metadata": { + "id": "99bbae2b-90f5-4c1d-a8b1-b8646878bf01", + "type": "decision", + "tags": [ + "session-07", + "M020" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.336236, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + }, + { + "text": "[M009] The original landing site is Malapert Ridge. 
The mission call sign is Aurora.", + "score": 0.322431, + "metadata": { + "id": "9eebacf1-4313-4ea5-a23b-d061e2d39248", + "type": "fact", + "tags": [ + "session-03", + "M009" + ] + } + }, + { + "text": "[M002] The original launch window is August 14, 2026, from Kiruna.", + "score": 0.31784, + "metadata": { + "id": "05efafef-ca00-4027-85ab-a919e6dcaace", + "type": "event", + "tags": [ + "session-01", + "M002" + ] + } + } + ] + }, + { + "query_id": "Q08", + "category": "current-state", + "query": "What is the current approved program cap?", + "retrieved_tokens": 113, + "score": { + "query_id": "Q08", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M021] The finance board reduces the program cap from USD 420,000 to USD 390,000.", + "score": 0.424936, + "metadata": { + "id": "980646f6-506a-4809-a032-b5a020a89c17", + "type": "decision", + "tags": [ + "session-07", + "M021" + ] + } + }, + { + "text": "[M010] Telemetry is retained for 90 days. The approved program cap is USD 420,000.", + "score": 0.387297, + "metadata": { + "id": "85a51ade-1572-4618-a81c-6f98acc301c3", + "type": "decision", + "tags": [ + "session-03", + "M010" + ] + } + }, + { + "text": "[M018] The 24 C thermal ceiling is revoked. 
Hold GH-7 at or below 22 C.", + "score": 0.340848, + "metadata": { + "id": "fc6d7079-d1e0-4e50-80bc-eac817ab6a16", + "type": "instruction", + "tags": [ + "session-06", + "M018" + ] + } + }, + { + "text": "[M017] Nutrient protocol N-21 supersedes N-17 and lowers target root-zone pH to 5.9.", + "score": 0.331629, + "metadata": { + "id": "3701a902-7157-41ec-be78-4f9ca96c72e9", + "type": "decision", + "tags": [ + "session-06", + "M017" + ] + } + }, + { + "text": "[M023] Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.331629, + "metadata": { + "id": "0eb4ec8d-d827-41af-9861-eb6334e5b181", + "type": "decision", + "tags": [ + "session-08", + "M023" + ] + } + } + ] + }, + { + "query_id": "Q09", + "category": "current-state", + "query": "Which seed vendor and purchase order are currently approved?", + "retrieved_tokens": 97, + "score": { + "query_id": "Q09", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M007] Helios Agritech is the approved seed vendor under purchase order PO-81.", + "score": 0.521407, + "metadata": { + "id": "ffea9941-7443-4017-9061-f4fc4fc75d16", + "type": "decision", + "tags": [ + "session-02", + "M007" + ] + } + }, + { + "text": "[M023] Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.477524, + "metadata": { + "id": "0eb4ec8d-d827-41af-9861-eb6334e5b181", + "type": "decision", + "tags": [ + "session-08", + "M023" + ] + } + }, + { + "text": "[M032] Final logistics lists Nova Seedworks PO-96 and battery pack BX-9.", + "score": 0.336236, + "metadata": { + "id": "7262fe31-c211-4ad3-836b-eeb7347e8a3a", + "type": "fact", + "tags": [ + "session-10", + "M032" + ] + } + }, + { + "text": "[M005] The baseline nutrient recipe is N-17 with a target root-zone pH of 6.2.", + "score": 0.322431, + "metadata": { + "id": 
"d7cb6dab-e618-4fd5-9597-72bb6dc0d11b", + "type": "decision", + "tags": [ + "session-02", + "M005" + ] + } + }, + { + "text": "[M020] Site review S-4 selects Shackleton rim and rejects Malapert Ridge.", + "score": 0.31784, + "metadata": { + "id": "99bbae2b-90f5-4c1d-a8b1-b8646878bf01", + "type": "decision", + "tags": [ + "session-07", + "M020" + ] + } + } + ] + }, + { + "query_id": "Q10", + "category": "current-state", + "query": "Which emergency valve procedure is active?", + "retrieved_tokens": 95, + "score": { + "query_id": "Q10", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M008] Emergency valve recovery uses procedure V1: isolate line B, then cycle pump 2.", + "score": 0.410767, + "metadata": { + "id": "1e6d2df9-9ea8-440c-87e2-c56391854cc8", + "type": "instruction", + "tags": [ + "session-03", + "M008" + ] + } + }, + { + "text": "[M025] Never use valve procedure V1 during flight operations.", + "score": 0.382624, + "metadata": { + "id": "1d83725e-df20-4137-b348-ea013c3d39bc", + "type": "instruction", + "tags": [ + "session-08", + "M025" + ] + } + }, + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.359354, + "metadata": { + "id": "d74b1abe-a4c1-4e2a-b69b-dea5a73af200", + "type": "instruction", + "tags": [ + "session-01", + "M004" + ] + } + }, + { + "text": "[M024] Procedure V3 supersedes V1: isolate line C, vent for 30 seconds, then cycle pump 1.", + "score": 0.327027, + "metadata": { + "id": "da111097-7163-46fd-a65b-35bf5cca953f", + "type": "instruction", + "tags": [ + "session-08", + "M024" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.308673, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + } + ] + }, + { + 
"query_id": "Q11", + "category": "current-state", + "query": "What are the current telemetry retention period and mission call sign?", + "retrieved_tokens": 96, + "score": { + "query_id": "Q11", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.458279, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + }, + { + "text": "[M026] Security review shortens telemetry retention from 90 days to 30 days.", + "score": 0.443939, + "metadata": { + "id": "5d806565-6ab8-42d1-a7e9-dba8a2838c3a", + "type": "decision", + "tags": [ + "session-09", + "M026" + ] + } + }, + { + "text": "[M010] Telemetry is retained for 90 days. The approved program cap is USD 420,000.", + "score": 0.439176, + "metadata": { + "id": "85a51ade-1572-4618-a81c-6f98acc301c3", + "type": "decision", + "tags": [ + "session-03", + "M010" + ] + } + }, + { + "text": "[M022] Helios Agritech lot H-44 is recalled after contamination screening.", + "score": 0.382624, + "metadata": { + "id": "a9bc0e28-d443-41a5-a392-a1d24dc83897", + "type": "event", + "tags": [ + "session-07", + "M022" + ] + } + }, + { + "text": "[M009] The original landing site is Malapert Ridge. 
The mission call sign is Aurora.", + "score": 0.373297, + "metadata": { + "id": "9eebacf1-4313-4ea5-a23b-d061e2d39248", + "type": "fact", + "tags": [ + "session-03", + "M009" + ] + } + } + ] + }, + { + "query_id": "Q12", + "category": "historical", + "query": "What crop was approved before revision C-2?", + "retrieved_tokens": 100, + "score": { + "query_id": "Q12", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M011] Crop plan revision C-2 replaces basil with dwarf radish for the flight trial.", + "score": 0.410767, + "metadata": { + "id": "c6df5d1b-4dce-4a18-9cb5-0a872945b42f", + "type": "decision", + "tags": [ + "session-04", + "M011" + ] + } + }, + { + "text": "[M007] Helios Agritech is the approved seed vendor under purchase order PO-81.", + "score": 0.410767, + "metadata": { + "id": "ffea9941-7443-4017-9061-f4fc4fc75d16", + "type": "decision", + "tags": [ + "session-02", + "M007" + ] + } + }, + { + "text": "[M029] Final readiness confirms GH-7, dwarf radish, N-21, and the 22 C ceiling.", + "score": 0.396664, + "metadata": { + "id": "1baab304-e3f1-4d74-902f-26fcf76b11d2", + "type": "fact", + "tags": [ + "session-10", + "M029" + ] + } + }, + { + "text": "[M012] Dwarf radish completed germination testing in GH-7 with no mold detected.", + "score": 0.382624, + "metadata": { + "id": "bf95a60b-03fd-4877-ab21-d46bc4b3a887", + "type": "observation", + "tags": [ + "session-04", + "M012" + ] + } + }, + { + "text": "[M023] Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.377957, + "metadata": { + "id": "0eb4ec8d-d827-41af-9861-eb6334e5b181", + "type": "decision", + "tags": [ + "session-08", + "M023" + ] + } + } + ] + }, + { + "query_id": "Q13", + "category": "historical", + "query": "Who was mission commander before Priya Nair?", + "retrieved_tokens": 103, + "score": { + "query_id": 
"Q13", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M015] Priya Nair replaces Elena Park as mission commander. Elena remains an adviser.", + "score": 0.45349, + "metadata": { + "id": "368569aa-1c3b-499b-9054-c5588c0798e3", + "type": "relationship", + "tags": [ + "session-05", + "M015" + ] + } + }, + { + "text": "[M003] Elena Park is mission commander. Dr. Amara Okafor owns plant science.", + "score": 0.387297, + "metadata": { + "id": "036341e9-a156-4bb6-88ee-5542b7bbb633", + "type": "relationship", + "tags": [ + "session-01", + "M003" + ] + } + }, + { + "text": "[M030] Final command brief lists Priya Nair, Shackleton rim, and call sign Lumen.", + "score": 0.377957, + "metadata": { + "id": "0d109ee2-62b5-4ff3-b28a-b747f0c23152", + "type": "fact", + "tags": [ + "session-10", + "M030" + ] + } + }, + { + "text": "[M014] Launch review L-3 moves the launch from August 14 to September 2, 2026.", + "score": 0.331629, + "metadata": { + "id": "62ddb53b-33d7-49b0-8e58-24f400df109d", + "type": "decision", + "tags": [ + "session-05", + "M014" + ] + } + }, + { + "text": "[M016] The current operations channel is Matrix room #asteria-flight, not Slack.", + "score": 0.327027, + "metadata": { + "id": "64cb1f08-0d7d-4335-b085-346d8dfe0f94", + "type": "instruction", + "tags": [ + "session-05", + "M016" + ] + } + } + ] + }, + { + "query_id": "Q14", + "category": "historical", + "query": "What was the original landing site?", + "retrieved_tokens": 100, + "score": { + "query_id": "Q14", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M009] The original landing site is Malapert Ridge. 
The mission call sign is Aurora.", + "score": 0.382624, + "metadata": { + "id": "9eebacf1-4313-4ea5-a23b-d061e2d39248", + "type": "fact", + "tags": [ + "session-03", + "M009" + ] + } + }, + { + "text": "[M020] Site review S-4 selects Shackleton rim and rejects Malapert Ridge.", + "score": 0.281286, + "metadata": { + "id": "99bbae2b-90f5-4c1d-a8b1-b8646878bf01", + "type": "decision", + "tags": [ + "session-07", + "M020" + ] + } + }, + { + "text": "[M002] The original launch window is August 14, 2026, from Kiruna.", + "score": 0.281286, + "metadata": { + "id": "05efafef-ca00-4027-85ab-a919e6dcaace", + "type": "event", + "tags": [ + "session-01", + "M002" + ] + } + }, + { + "text": "[M030] Final command brief lists Priya Nair, Shackleton rim, and call sign Lumen.", + "score": 0.267653, + "metadata": { + "id": "0d109ee2-62b5-4ff3-b28a-b747f0c23152", + "type": "fact", + "tags": [ + "session-10", + "M030" + ] + } + }, + { + "text": "[M032] Final logistics lists Nova Seedworks PO-96 and battery pack BX-9.", + "score": 0.258584, + "metadata": { + "id": "7262fe31-c211-4ad3-836b-eeb7347e8a3a", + "type": "fact", + "tags": [ + "session-10", + "M032" + ] + } + } + ] + }, + { + "query_id": "Q15", + "category": "historical", + "query": "Which valve procedure was used before V3?", + "retrieved_tokens": 103, + "score": { + "query_id": "Q15", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M024] Procedure V3 supersedes V1: isolate line C, vent for 30 seconds, then cycle pump 1.", + "score": 0.382624, + "metadata": { + "id": "da111097-7163-46fd-a65b-35bf5cca953f", + "type": "instruction", + "tags": [ + "session-08", + "M024" + ] + } + }, + { + "text": "[M008] Emergency valve recovery uses procedure V1: isolate line B, then cycle pump 2.", + "score": 0.368643, + "metadata": { + "id": "1e6d2df9-9ea8-440c-87e2-c56391854cc8", + "type": 
"instruction", + "tags": [ + "session-03", + "M008" + ] + } + }, + { + "text": "[M025] Never use valve procedure V1 during flight operations.", + "score": 0.363996, + "metadata": { + "id": "1d83725e-df20-4137-b348-ea013c3d39bc", + "type": "instruction", + "tags": [ + "session-08", + "M025" + ] + } + }, + { + "text": "[M023] Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.313254, + "metadata": { + "id": "0eb4ec8d-d827-41af-9861-eb6334e5b181", + "type": "decision", + "tags": [ + "session-08", + "M023" + ] + } + }, + { + "text": "[M014] Launch review L-3 moves the launch from August 14 to September 2, 2026.", + "score": 0.304097, + "metadata": { + "id": "62ddb53b-33d7-49b0-8e58-24f400df109d", + "type": "decision", + "tags": [ + "session-05", + "M014" + ] + } + } + ] + }, + { + "query_id": "Q16", + "category": "multi-hop", + "query": "Prepare the current command brief: commander, landing site, and call sign.", + "retrieved_tokens": 90, + "score": { + "query_id": "Q16", + "category": "multi-hop", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "[M009] The original landing site is Malapert Ridge. 
The mission call sign is Aurora.", + "score": 0.401358, + "metadata": { + "id": "9eebacf1-4313-4ea5-a23b-d061e2d39248", + "type": "fact", + "tags": [ + "session-03", + "M009" + ] + } + }, + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.368643, + "metadata": { + "id": "d74b1abe-a4c1-4e2a-b69b-dea5a73af200", + "type": "instruction", + "tags": [ + "session-01", + "M004" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.363996, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + }, + { + "text": "[M031] Send current alerts only to Matrix room #asteria-flight.", + "score": 0.363996, + "metadata": { + "id": "36840e1c-bed7-4b97-a03a-71b12684667a", + "type": "instruction", + "tags": [ + "session-10", + "M031" + ] + } + }, + { + "text": "[M030] Final command brief lists Priya Nair, Shackleton rim, and call sign Lumen.", + "score": 0.359354, + "metadata": { + "id": "0d109ee2-62b5-4ff3-b28a-b747f0c23152", + "type": "fact", + "tags": [ + "session-10", + "M030" + ] + } + } + ] + }, + { + "query_id": "Q17", + "category": "multi-hop", + "query": "Prepare the current greenhouse brief: module, crop, nutrient protocol, and temperature limit.", + "retrieved_tokens": 106, + "score": { + "query_id": "Q17", + "category": "multi-hop", + "coverage": 0.5, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 4, + "matched_forbidden": 3 + }, + "hits": [ + { + "text": "[M019] Battery pack BX-9 supplies the greenhouse controller during transfer.", + "score": 0.420205, + "metadata": { + "id": "b2973696-700b-48f8-b4e8-4e3c99b09966", + "type": "fact", + "tags": [ + "session-06", + "M019" + ] + } + }, + { + "text": "[M001] Asteria's greenhouse module is GH-7. 
The initial crop is Genovese basil.", + "score": 0.415482, + "metadata": { + "id": "f9b0346a-18f2-47c7-b57d-3080dd87109f", + "type": "fact", + "tags": [ + "session-01", + "M001" + ] + } + }, + { + "text": "[M006] The greenhouse thermal ceiling is 24 C. Backup oxygen lasts 18 hours.", + "score": 0.391977, + "metadata": { + "id": "e1f6be61-65eb-4e9f-9b29-191aa02aac21", + "type": "fact", + "tags": [ + "session-02", + "M006" + ] + } + }, + { + "text": "[M017] Nutrient protocol N-21 supersedes N-17 and lowers target root-zone pH to 5.9.", + "score": 0.387297, + "metadata": { + "id": "3701a902-7157-41ec-be78-4f9ca96c72e9", + "type": "decision", + "tags": [ + "session-06", + "M017" + ] + } + }, + { + "text": "[M005] The baseline nutrient recipe is N-17 with a target root-zone pH of 6.2.", + "score": 0.387297, + "metadata": { + "id": "d7cb6dab-e618-4fd5-9597-72bb6dc0d11b", + "type": "decision", + "tags": [ + "session-02", + "M005" + ] + } + } + ] + }, + { + "query_id": "Q18", + "category": "multi-hop", + "query": "Prepare current logistics: vendor, order, and transfer battery.", + "retrieved_tokens": 88, + "score": { + "query_id": "Q18", + "category": "multi-hop", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M032] Final logistics lists Nova Seedworks PO-96 and battery pack BX-9.", + "score": 0.368643, + "metadata": { + "id": "7262fe31-c211-4ad3-836b-eeb7347e8a3a", + "type": "fact", + "tags": [ + "session-10", + "M032" + ] + } + }, + { + "text": "[M019] Battery pack BX-9 supplies the greenhouse controller during transfer.", + "score": 0.345466, + "metadata": { + "id": "b2973696-700b-48f8-b4e8-4e3c99b09966", + "type": "fact", + "tags": [ + "session-06", + "M019" + ] + } + }, + { + "text": "[M023] Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.331629, + "metadata": { + "id": "0eb4ec8d-d827-41af-9861-eb6334e5b181", + 
"type": "decision", + "tags": [ + "session-08", + "M023" + ] + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.331629, + "metadata": { + "id": "81d82a6c-5b84-457c-9532-b90b16ad6136", + "type": "decision", + "tags": [ + "session-09", + "M027" + ] + } + }, + { + "text": "[M007] Helios Agritech is the approved seed vendor under purchase order PO-81.", + "score": 0.308673, + "metadata": { + "id": "ffea9941-7443-4017-9061-f4fc4fc75d16", + "type": "decision", + "tags": [ + "session-02", + "M007" + ] + } + } + ] + } + ] + }, + { + "backend": "mem0-direct", + "summary": { + "mean_coverage": 0.986111, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "by_category": { + "current-state": { + "mean_coverage": 1.0, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "queries": 11 + }, + "historical": { + "mean_coverage": 1.0, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "queries": 4 + }, + "multi-hop": { + "mean_coverage": 0.916667, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "queries": 3 + } + } + }, + "metrics": { + "records_ingested": 32, + "queries_evaluated": 18, + "top_k": 5, + "latency_repeats": 5, + "source_tokens": 551, + "retrieved_tokens": 1783, + "avg_retrieved_tokens_per_query": 99.056, + "ingest_total_s": 3.995826, + "ingest_p50_s": 0.123843, + "ingest_p95_s": 0.146265, + "index_ready_s": 0.071771, + "query_mean_s": 0.078567, + "query_p50_s": 0.074995, + "query_p95_s": 0.103795, + "client_rss_delta_mb": 4.332, + "llm_calls": 0, + "llm_input_tokens": 0, + "llm_output_tokens": 0 + }, + "scores": [ + { + "query_id": "Q01", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q02", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + 
}, + { + "query_id": "Q03", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q04", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + { + "query_id": "Q05", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + { + "query_id": "Q06", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q07", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q08", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q09", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + { + "query_id": "Q10", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + { + "query_id": "Q11", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + { + "query_id": "Q12", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q13", + "category": "historical", + "coverage": 
1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q14", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q15", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + { + "query_id": "Q16", + "category": "multi-hop", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 2 + }, + { + "query_id": "Q17", + "category": "multi-hop", + "coverage": 0.75, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 4, + "matched_forbidden": 3 + }, + { + "query_id": "Q18", + "category": "multi-hop", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + } + ], + "queries": [ + { + "query_id": "Q01", + "category": "current-state", + "query": "What crop is currently approved for the Asteria flight trial?", + "retrieved_tokens": 97, + "score": { + "query_id": "Q01", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M001] Asteria's greenhouse module is GH-7. 
The initial crop is Genovese basil.", + "score": 0.7316455335503272, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + }, + { + "text": "[M011] Crop plan revision C-2 replaces basil with dwarf radish for the flight trial.", + "score": 0.7274233978409959, + "metadata": { + "record_id": "M011", + "session": 4, + "memory_type": "decision" + } + }, + { + "text": "[M007] Helios Agritech is the approved seed vendor under purchase order PO-81.", + "score": 0.6452078507768324, + "metadata": { + "record_id": "M007", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "[M013] Do not load Genovese basil seed into the flight cassette after revision C-2.", + "score": 0.6040653477816942, + "metadata": { + "record_id": "M013", + "session": 4, + "memory_type": "instruction" + } + }, + { + "text": "[M031] Send current alerts only to Matrix room #asteria-flight.", + "score": 0.5962135760379794, + "metadata": { + "record_id": "M031", + "session": 10, + "memory_type": "instruction" + } + } + ] + }, + { + "query_id": "Q02", + "category": "current-state", + "query": "What is the current launch date?", + "retrieved_tokens": 103, + "score": { + "query_id": "Q02", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M002] The original launch window is August 14, 2026, from Kiruna.", + "score": 0.7756558922583932, + "metadata": { + "record_id": "M002", + "session": 1, + "memory_type": "event" + } + }, + { + "text": "[M014] Launch review L-3 moves the launch from August 14 to September 2, 2026.", + "score": 0.7504998712532465, + "metadata": { + "record_id": "M014", + "session": 5, + "memory_type": "decision" + } + }, + { + "text": "[M028] Dr. 
Okafor will deliver the final radish growth report before August 20.", + "score": 0.5536388924160689, + "metadata": { + "record_id": "M028", + "session": 9, + "memory_type": "commitment" + } + }, + { + "text": "[M016] The current operations channel is Matrix room #asteria-flight, not Slack.", + "score": 0.5451480429271631, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + }, + { + "text": "[M009] The original landing site is Malapert Ridge. The mission call sign is Aurora.", + "score": 0.5340067050882729, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q03", + "category": "current-state", + "query": "Who is the current mission commander?", + "retrieved_tokens": 90, + "score": { + "query_id": "Q03", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M015] Priya Nair replaces Elena Park as mission commander. Elena remains an adviser.", + "score": 0.6891962860208289, + "metadata": { + "record_id": "M015", + "session": 5, + "memory_type": "relationship" + } + }, + { + "text": "[M003] Elena Park is mission commander. Dr. 
Amara Okafor owns plant science.", + "score": 0.6546714243334143, + "metadata": { + "record_id": "M003", + "session": 1, + "memory_type": "relationship" + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.5978281284473652, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "[M016] The current operations channel is Matrix room #asteria-flight, not Slack.", + "score": 0.5886503096372522, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + }, + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.5571302746475111, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + } + ] + }, + { + "query_id": "Q04", + "category": "current-state", + "query": "Where should current operational alerts be sent?", + "retrieved_tokens": 86, + "score": { + "query_id": "Q04", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.735328460513914, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "[M031] Send current alerts only to Matrix room #asteria-flight.", + "score": 0.6710097855156496, + "metadata": { + "record_id": "M031", + "session": 10, + "memory_type": "instruction" + } + }, + { + "text": "[M016] The current operations channel is Matrix room #asteria-flight, not Slack.", + "score": 0.5789558680735702, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.564061849496196, + "metadata": { + "record_id": "M027", + "session": 9, + 
"memory_type": "decision" + } + }, + { + "text": "[M009] The original landing site is Malapert Ridge. The mission call sign is Aurora.", + "score": 0.5143508620365065, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q05", + "category": "current-state", + "query": "Which nutrient protocol and pH target are active?", + "retrieved_tokens": 107, + "score": { + "query_id": "Q05", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "[M017] Nutrient protocol N-21 supersedes N-17 and lowers target root-zone pH to 5.9.", + "score": 0.7549154946619239, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + }, + { + "text": "[M005] The baseline nutrient recipe is N-17 with a target root-zone pH of 6.2.", + "score": 0.729098449640031, + "metadata": { + "record_id": "M005", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "[M001] Asteria's greenhouse module is GH-7. The initial crop is Genovese basil.", + "score": 0.5416408387829493, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + }, + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.5268385900546638, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "[M003] Elena Park is mission commander. Dr. 
Amara Okafor owns plant science.", + "score": 0.523161724812621, + "metadata": { + "record_id": "M003", + "session": 1, + "memory_type": "relationship" + } + } + ] + }, + { + "query_id": "Q06", + "category": "current-state", + "query": "What is the active greenhouse thermal ceiling?", + "retrieved_tokens": 104, + "score": { + "query_id": "Q06", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M006] The greenhouse thermal ceiling is 24 C. Backup oxygen lasts 18 hours.", + "score": 0.7905405764720894, + "metadata": { + "record_id": "M006", + "session": 2, + "memory_type": "fact" + } + }, + { + "text": "[M018] The 24 C thermal ceiling is revoked. Hold GH-7 at or below 22 C.", + "score": 0.6964554937698523, + "metadata": { + "record_id": "M018", + "session": 6, + "memory_type": "instruction" + } + }, + { + "text": "[M001] Asteria's greenhouse module is GH-7. 
The initial crop is Genovese basil.", + "score": 0.6174272531060604, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + }, + { + "text": "[M029] Final readiness confirms GH-7, dwarf radish, N-21, and the 22 C ceiling.", + "score": 0.5741037509776863, + "metadata": { + "record_id": "M029", + "session": 10, + "memory_type": "fact" + } + }, + { + "text": "[M019] Battery pack BX-9 supplies the greenhouse controller during transfer.", + "score": 0.5638031395579187, + "metadata": { + "record_id": "M019", + "session": 6, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q07", + "category": "current-state", + "query": "Which landing site is selected now?", + "retrieved_tokens": 101, + "score": { + "query_id": "Q07", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M009] The original landing site is Malapert Ridge. 
The mission call sign is Aurora.", + "score": 0.6622018310249207, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "[M002] The original launch window is August 14, 2026, from Kiruna.", + "score": 0.6314091136734521, + "metadata": { + "record_id": "M002", + "session": 1, + "memory_type": "event" + } + }, + { + "text": "[M014] Launch review L-3 moves the launch from August 14 to September 2, 2026.", + "score": 0.6157290368320769, + "metadata": { + "record_id": "M014", + "session": 5, + "memory_type": "decision" + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.5520767153364204, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "[M020] Site review S-4 selects Shackleton rim and rejects Malapert Ridge.", + "score": 0.5483777271565378, + "metadata": { + "record_id": "M020", + "session": 7, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q08", + "category": "current-state", + "query": "What is the current approved program cap?", + "retrieved_tokens": 115, + "score": { + "query_id": "Q08", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M021] The finance board reduces the program cap from USD 420,000 to USD 390,000.", + "score": 0.7197178979908626, + "metadata": { + "record_id": "M021", + "session": 7, + "memory_type": "decision" + } + }, + { + "text": "[M010] Telemetry is retained for 90 days. 
The approved program cap is USD 420,000.", + "score": 0.6734659082328405, + "metadata": { + "record_id": "M010", + "session": 3, + "memory_type": "decision" + } + }, + { + "text": "[M029] Final readiness confirms GH-7, dwarf radish, N-21, and the 22 C ceiling.", + "score": 0.5143576094888591, + "metadata": { + "record_id": "M029", + "session": 10, + "memory_type": "fact" + } + }, + { + "text": "[M018] The 24 C thermal ceiling is revoked. Hold GH-7 at or below 22 C.", + "score": 0.5131174184443416, + "metadata": { + "record_id": "M018", + "session": 6, + "memory_type": "instruction" + } + }, + { + "text": "[M028] Dr. Okafor will deliver the final radish growth report before August 20.", + "score": 0.50547838295667, + "metadata": { + "record_id": "M028", + "session": 9, + "memory_type": "commitment" + } + } + ] + }, + { + "query_id": "Q09", + "category": "current-state", + "query": "Which seed vendor and purchase order are currently approved?", + "retrieved_tokens": 95, + "score": { + "query_id": "Q09", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M007] Helios Agritech is the approved seed vendor under purchase order PO-81.", + "score": 0.7998175233777693, + "metadata": { + "record_id": "M007", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "[M023] Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.7561052246527893, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + }, + { + "text": "[M028] Dr. 
Okafor will deliver the final radish growth report before August 20.", + "score": 0.5391001511463163, + "metadata": { + "record_id": "M028", + "session": 9, + "memory_type": "commitment" + } + }, + { + "text": "[M012] Dwarf radish completed germination testing in GH-7 with no mold detected.", + "score": 0.5268744475559153, + "metadata": { + "record_id": "M012", + "session": 4, + "memory_type": "observation" + } + }, + { + "text": "[M032] Final logistics lists Nova Seedworks PO-96 and battery pack BX-9.", + "score": 0.5245138951676702, + "metadata": { + "record_id": "M032", + "session": 10, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q10", + "category": "current-state", + "query": "Which emergency valve procedure is active?", + "retrieved_tokens": 95, + "score": { + "query_id": "Q10", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M008] Emergency valve recovery uses procedure V1: isolate line B, then cycle pump 2.", + "score": 0.7521708349901401, + "metadata": { + "record_id": "M008", + "session": 3, + "memory_type": "instruction" + } + }, + { + "text": "[M025] Never use valve procedure V1 during flight operations.", + "score": 0.6853247792052595, + "metadata": { + "record_id": "M025", + "session": 8, + "memory_type": "instruction" + } + }, + { + "text": "[M024] Procedure V3 supersedes V1: isolate line C, vent for 30 seconds, then cycle pump 1.", + "score": 0.6012863412590774, + "metadata": { + "record_id": "M024", + "session": 8, + "memory_type": "instruction" + } + }, + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.5780290599282738, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "[M016] The current operations channel is Matrix room #asteria-flight, not Slack.", + "score": 0.506103740235949, + 
"metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + } + ] + }, + { + "query_id": "Q11", + "category": "current-state", + "query": "What are the current telemetry retention period and mission call sign?", + "retrieved_tokens": 94, + "score": { + "query_id": "Q11", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "[M010] Telemetry is retained for 90 days. The approved program cap is USD 420,000.", + "score": 0.7374689823290339, + "metadata": { + "record_id": "M010", + "session": 3, + "memory_type": "decision" + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.7260302959210041, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "[M026] Security review shortens telemetry retention from 90 days to 30 days.", + "score": 0.7138179943689928, + "metadata": { + "record_id": "M026", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "[M009] The original landing site is Malapert Ridge. 
The mission call sign is Aurora.", + "score": 0.654783781064501, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.5914376034654989, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + } + ] + }, + { + "query_id": "Q12", + "category": "historical", + "query": "What crop was approved before revision C-2?", + "retrieved_tokens": 107, + "score": { + "query_id": "Q12", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M011] Crop plan revision C-2 replaces basil with dwarf radish for the flight trial.", + "score": 0.7154346404378498, + "metadata": { + "record_id": "M011", + "session": 4, + "memory_type": "decision" + } + }, + { + "text": "[M013] Do not load Genovese basil seed into the flight cassette after revision C-2.", + "score": 0.6322046309398418, + "metadata": { + "record_id": "M013", + "session": 4, + "memory_type": "instruction" + } + }, + { + "text": "[M001] Asteria's greenhouse module is GH-7. 
The initial crop is Genovese basil.", + "score": 0.6087364586834797, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + }, + { + "text": "[M007] Helios Agritech is the approved seed vendor under purchase order PO-81.", + "score": 0.595568623089301, + "metadata": { + "record_id": "M007", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "[M029] Final readiness confirms GH-7, dwarf radish, N-21, and the 22 C ceiling.", + "score": 0.5939618678114303, + "metadata": { + "record_id": "M029", + "session": 10, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q13", + "category": "historical", + "query": "Who was mission commander before Priya Nair?", + "retrieved_tokens": 96, + "score": { + "query_id": "Q13", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M015] Priya Nair replaces Elena Park as mission commander. Elena remains an adviser.", + "score": 0.7977209116861806, + "metadata": { + "record_id": "M015", + "session": 5, + "memory_type": "relationship" + } + }, + { + "text": "[M003] Elena Park is mission commander. Dr. Amara Okafor owns plant science.", + "score": 0.6544658493016464, + "metadata": { + "record_id": "M003", + "session": 1, + "memory_type": "relationship" + } + }, + { + "text": "[M030] Final command brief lists Priya Nair, Shackleton rim, and call sign Lumen.", + "score": 0.6538635900636673, + "metadata": { + "record_id": "M030", + "session": 10, + "memory_type": "fact" + } + }, + { + "text": "[M009] The original landing site is Malapert Ridge. 
The mission call sign is Aurora.", + "score": 0.5581169814118354, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.5434312866367155, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + } + ] + }, + { + "query_id": "Q14", + "category": "historical", + "query": "What was the original landing site?", + "retrieved_tokens": 101, + "score": { + "query_id": "Q14", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M009] The original landing site is Malapert Ridge. The mission call sign is Aurora.", + "score": 0.7359648442302898, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "[M002] The original launch window is August 14, 2026, from Kiruna.", + "score": 0.5746170132221255, + "metadata": { + "record_id": "M002", + "session": 1, + "memory_type": "event" + } + }, + { + "text": "[M014] Launch review L-3 moves the launch from August 14 to September 2, 2026.", + "score": 0.5128864888838551, + "metadata": { + "record_id": "M014", + "session": 5, + "memory_type": "decision" + } + }, + { + "text": "[M020] Site review S-4 selects Shackleton rim and rejects Malapert Ridge.", + "score": 0.5030914907187223, + "metadata": { + "record_id": "M020", + "session": 7, + "memory_type": "decision" + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.49001786368288747, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q15", + "category": "historical", + "query": "Which valve procedure was used before V3?", + "retrieved_tokens": 103, + "score": { + "query_id": "Q15", + "category": "historical", + 
"coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M025] Never use valve procedure V1 during flight operations.", + "score": 0.7061607035109262, + "metadata": { + "record_id": "M025", + "session": 8, + "memory_type": "instruction" + } + }, + { + "text": "[M024] Procedure V3 supersedes V1: isolate line C, vent for 30 seconds, then cycle pump 1.", + "score": 0.6712519019569896, + "metadata": { + "record_id": "M024", + "session": 8, + "memory_type": "instruction" + } + }, + { + "text": "[M008] Emergency valve recovery uses procedure V1: isolate line B, then cycle pump 2.", + "score": 0.6697791031835534, + "metadata": { + "record_id": "M008", + "session": 3, + "memory_type": "instruction" + } + }, + { + "text": "[M014] Launch review L-3 moves the launch from August 14 to September 2, 2026.", + "score": 0.5149396305542303, + "metadata": { + "record_id": "M014", + "session": 5, + "memory_type": "decision" + } + }, + { + "text": "[M022] Helios Agritech lot H-44 is recalled after contamination screening.", + "score": 0.5129829209960179, + "metadata": { + "record_id": "M022", + "session": 7, + "memory_type": "event" + } + } + ] + }, + { + "query_id": "Q16", + "category": "multi-hop", + "query": "Prepare the current command brief: commander, landing site, and call sign.", + "retrieved_tokens": 90, + "score": { + "query_id": "Q16", + "category": "multi-hop", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "[M030] Final command brief lists Priya Nair, Shackleton rim, and call sign Lumen.", + "score": 0.6756771872190969, + "metadata": { + "record_id": "M030", + "session": 10, + "memory_type": "fact" + } + }, + { + "text": "[M009] The original landing site is Malapert Ridge. 
The mission call sign is Aurora.", + "score": 0.6618295145364166, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.6293524860228891, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "[M004] Operational alerts go to Slack channel #asteria-ops.", + "score": 0.5808755921140929, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "[M031] Send current alerts only to Matrix room #asteria-flight.", + "score": 0.5596943849059258, + "metadata": { + "record_id": "M031", + "session": 10, + "memory_type": "instruction" + } + } + ] + }, + { + "query_id": "Q17", + "category": "multi-hop", + "query": "Prepare the current greenhouse brief: module, crop, nutrient protocol, and temperature limit.", + "retrieved_tokens": 111, + "score": { + "query_id": "Q17", + "category": "multi-hop", + "coverage": 0.75, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 4, + "matched_forbidden": 3 + }, + "hits": [ + { + "text": "[M001] Asteria's greenhouse module is GH-7. The initial crop is Genovese basil.", + "score": 0.7096052867563418, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + }, + { + "text": "[M006] The greenhouse thermal ceiling is 24 C. 
Backup oxygen lasts 18 hours.", + "score": 0.7036409993701334, + "metadata": { + "record_id": "M006", + "session": 2, + "memory_type": "fact" + } + }, + { + "text": "[M017] Nutrient protocol N-21 supersedes N-17 and lowers target root-zone pH to 5.9.", + "score": 0.6548996437608837, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + }, + { + "text": "[M005] The baseline nutrient recipe is N-17 with a target root-zone pH of 6.2.", + "score": 0.6446778122128647, + "metadata": { + "record_id": "M005", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "[M011] Crop plan revision C-2 replaces basil with dwarf radish for the flight trial.", + "score": 0.6305630926187897, + "metadata": { + "record_id": "M011", + "session": 4, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q18", + "category": "multi-hop", + "query": "Prepare current logistics: vendor, order, and transfer battery.", + "retrieved_tokens": 88, + "score": { + "query_id": "Q18", + "category": "multi-hop", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "[M032] Final logistics lists Nova Seedworks PO-96 and battery pack BX-9.", + "score": 0.6721699480711288, + "metadata": { + "record_id": "M032", + "session": 10, + "memory_type": "fact" + } + }, + { + "text": "[M019] Battery pack BX-9 supplies the greenhouse controller during transfer.", + "score": 0.6397008476415957, + "metadata": { + "record_id": "M019", + "session": 6, + "memory_type": "fact" + } + }, + { + "text": "[M023] Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.5822560832830372, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + }, + { + "text": "[M007] Helios Agritech is the approved seed vendor under purchase order PO-81.", + "score": 0.556822757455921, + "metadata": { + "record_id": 
"M007", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "[M027] The mission call sign changes from Aurora to Lumen for all current traffic.", + "score": 0.5411729212081198, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + } + ] + } + ] + }, + { + "backend": "mem0-agentic", + "summary": { + "mean_coverage": 0.694444, + "exact_accuracy": 0.111111, + "stale_leak_rate": 0.888889, + "by_category": { + "current-state": { + "mean_coverage": 0.636364, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "queries": 11 + }, + "historical": { + "mean_coverage": 1.0, + "exact_accuracy": 0.5, + "stale_leak_rate": 0.5, + "queries": 4 + }, + "multi-hop": { + "mean_coverage": 0.5, + "exact_accuracy": 0.0, + "stale_leak_rate": 1.0, + "queries": 3 + } + } + }, + "metrics": { + "records_ingested": 32, + "queries_evaluated": 18, + "top_k": 5, + "latency_repeats": 5, + "source_tokens": 551, + "retrieved_tokens": 1793, + "avg_retrieved_tokens_per_query": 99.611, + "ingest_total_s": 2912.081626, + "ingest_p50_s": 90.387719, + "ingest_p95_s": 93.141154, + "index_ready_s": 0.069627, + "query_mean_s": 0.077937, + "query_p50_s": 0.073744, + "query_p95_s": 0.103231, + "client_rss_delta_mb": 0.352, + "llm_calls": 32, + "llm_input_tokens": 131040, + "llm_output_tokens": 3650 + }, + "scores": [ + { + "query_id": "Q01", + "category": "current-state", + "coverage": 0.0, + "stale_leak": true, + "exact": false, + "matched_required": 0, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q02", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q03", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q04", + "category": "current-state", + "coverage": 1.0, + 
"stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + { + "query_id": "Q05", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + { + "query_id": "Q06", + "category": "current-state", + "coverage": 0.0, + "stale_leak": true, + "exact": false, + "matched_required": 0, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q07", + "category": "current-state", + "coverage": 0.0, + "stale_leak": true, + "exact": false, + "matched_required": 0, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q08", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q09", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + { + "query_id": "Q10", + "category": "current-state", + "coverage": 0.0, + "stale_leak": true, + "exact": false, + "matched_required": 0, + "required_total": 3, + "matched_forbidden": 1 + }, + { + "query_id": "Q11", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + { + "query_id": "Q12", + "category": "historical", + "coverage": 1.0, + "stale_leak": false, + "exact": true, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 0 + }, + { + "query_id": "Q13", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q14", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + 
"required_total": 1, + "matched_forbidden": 1 + }, + { + "query_id": "Q15", + "category": "historical", + "coverage": 1.0, + "stale_leak": false, + "exact": true, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 0 + }, + { + "query_id": "Q16", + "category": "multi-hop", + "coverage": 0.333333, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 3, + "matched_forbidden": 2 + }, + { + "query_id": "Q17", + "category": "multi-hop", + "coverage": 0.5, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 4, + "matched_forbidden": 2 + }, + { + "query_id": "Q18", + "category": "multi-hop", + "coverage": 0.666667, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 3, + "matched_forbidden": 1 + } + ], + "queries": [ + { + "query_id": "Q01", + "category": "current-state", + "query": "What crop is currently approved for the Asteria flight trial?", + "retrieved_tokens": 98, + "score": { + "query_id": "Q01", + "category": "current-state", + "coverage": 0.0, + "stale_leak": true, + "exact": false, + "matched_required": 0, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "Asteria's greenhouse module is GH-7, and the initial crop is Genovese basil.", + "score": 0.7194901019464908, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + }, + { + "text": "Helios Agritech is the approved seed vendor for Asteria's greenhouse module under purchase order PO-81.", + "score": 0.701814070102712, + "metadata": { + "record_id": "M007", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "The approved program cap for Asteria's greenhouse module is USD 420,000.", + "score": 0.7010847716230442, + "metadata": { + "record_id": "M010", + "session": 3, + "memory_type": "decision" + } + }, + { + "text": "The approved program cap for Asteria's greenhouse module is USD 390,000.", + "score": 0.6994412703464734, 
+ "metadata": { + "record_id": "M024", + "session": 8, + "memory_type": "instruction" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026.", + "score": 0.6939944781652788, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q02", + "category": "current-state", + "query": "What is the current launch date?", + "retrieved_tokens": 115, + "score": { + "query_id": "Q02", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "The launch window for Asteria's greenhouse module is now set to September 2, 2026.", + "score": 0.7053919286996808, + "metadata": { + "record_id": "M014", + "session": 5, + "memory_type": "decision" + } + }, + { + "text": "The original launch window for Asteria's greenhouse module is August 14, 2026, from Kiruna.", + "score": 0.6995399423286328, + "metadata": { + "record_id": "M002", + "session": 1, + "memory_type": "event" + } + }, + { + "text": "The launch window for Asteria's greenhouse module is now set to September 2, 2026, from Kiruna.", + "score": 0.6943353698742786, + "metadata": { + "record_id": "M021", + "session": 7, + "memory_type": "decision" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026.", + "score": 0.6846143299217315, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026, from Kiruna.", + "score": 0.673682653731482, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q03", + "category": "current-state", + "query": "Who is the current mission commander?", + "retrieved_tokens": 91, + "score": { + 
"query_id": "Q03", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "Priya Nair replaces Elena Park as mission commander, and Dr. Amara Okafor remains an adviser.", + "score": 0.6858203694930951, + "metadata": { + "record_id": "M015", + "session": 5, + "memory_type": "relationship" + } + }, + { + "text": "Elena Park is mission commander, and Dr. Amara Okafor owns plant science.", + "score": 0.6109025321417307, + "metadata": { + "record_id": "M003", + "session": 1, + "memory_type": "relationship" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is now Lumen.", + "score": 0.5348382261972218, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "The current operations channel for Asteria's greenhouse module is Matrix room #asteria-flight, not Slack.", + "score": 0.5226587729039973, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is Aurora.", + "score": 0.5203405560132583, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q04", + "category": "current-state", + "query": "Where should current operational alerts be sent?", + "retrieved_tokens": 90, + "score": { + "query_id": "Q04", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "Operational alerts for Asteria's greenhouse module are sent to Slack channel #asteria-ops.", + "score": 0.6637453271424572, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "The current operations channel for Asteria's greenhouse module is 
Matrix room #asteria-flight, not Slack.", + "score": 0.5207972480644059, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is Aurora.", + "score": 0.5127459415515732, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is now Lumen.", + "score": 0.5071081477878081, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026.", + "score": 0.4741498625455865, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q05", + "category": "current-state", + "query": "Which nutrient protocol and pH target are active?", + "retrieved_tokens": 116, + "score": { + "query_id": "Q05", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "The baseline nutrient recipe for Asteria's greenhouse module is N-21, with a target root-zone pH of 5.9.", + "score": 0.6724358786554634, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + }, + { + "text": "The baseline nutrient recipe for Asteria's greenhouse module is N-17, with a target root-zone pH of 6.2.", + "score": 0.6677053530642676, + "metadata": { + "record_id": "M005", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "Operational alerts for Asteria's greenhouse module are sent to Slack channel #asteria-ops.", + "score": 0.5306653709262017, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "Asteria's greenhouse module is GH-7, and the initial crop is Genovese basil.", + 
"score": 0.5097223320602482, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + }, + { + "text": "The current operations channel for Asteria's greenhouse module is Matrix room #asteria-flight, not Slack.", + "score": 0.5025140888029355, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + } + ] + }, + { + "query_id": "Q06", + "category": "current-state", + "query": "What is the active greenhouse thermal ceiling?", + "retrieved_tokens": 94, + "score": { + "query_id": "Q06", + "category": "current-state", + "coverage": 0.0, + "stale_leak": true, + "exact": false, + "matched_required": 0, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "The greenhouse thermal ceiling is set to 24 degrees Celsius, and the backup oxygen supply lasts for 18 hours.", + "score": 0.7924965639617187, + "metadata": { + "record_id": "M006", + "session": 2, + "memory_type": "fact" + } + }, + { + "text": "The approved program cap for Asteria's greenhouse module is USD 420,000.", + "score": 0.6476385776080372, + "metadata": { + "record_id": "M010", + "session": 3, + "memory_type": "decision" + } + }, + { + "text": "The approved program cap for Asteria's greenhouse module is USD 390,000.", + "score": 0.6438607271670869, + "metadata": { + "record_id": "M024", + "session": 8, + "memory_type": "instruction" + } + }, + { + "text": "The current operations channel for Asteria's greenhouse module is Matrix room #asteria-flight, not Slack.", + "score": 0.6228638199726709, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + }, + { + "text": "The original landing site for Asteria's greenhouse module is Malapert Ridge.", + "score": 0.6207167515080567, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q07", + "category": "current-state", + "query": "Which landing site is selected now?", + 
"retrieved_tokens": 110, + "score": { + "query_id": "Q07", + "category": "current-state", + "coverage": 0.0, + "stale_leak": true, + "exact": false, + "matched_required": 0, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "The original landing site for Asteria's greenhouse module is Malapert Ridge.", + "score": 0.6130674977759014, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "The launch window for Asteria's greenhouse module is now set to September 2, 2026, from Kiruna.", + "score": 0.6019349193191895, + "metadata": { + "record_id": "M021", + "session": 7, + "memory_type": "decision" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026, from Kiruna.", + "score": 0.5930554626209197, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + }, + { + "text": "The original launch window for Asteria's greenhouse module is August 14, 2026, from Kiruna.", + "score": 0.5928507932691411, + "metadata": { + "record_id": "M002", + "session": 1, + "memory_type": "event" + } + }, + { + "text": "The launch window for Asteria's greenhouse module is now set to September 2, 2026.", + "score": 0.5866702142178286, + "metadata": { + "record_id": "M014", + "session": 5, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q08", + "category": "current-state", + "query": "What is the current approved program cap?", + "retrieved_tokens": 98, + "score": { + "query_id": "Q08", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "The approved program cap for Asteria's greenhouse module is USD 390,000.", + "score": 0.6808719838369408, + "metadata": { + "record_id": "M024", + "session": 8, + "memory_type": "instruction" + } + }, + { + "text": "The approved program cap 
for Asteria's greenhouse module is USD 420,000.", + "score": 0.6771476264061573, + "metadata": { + "record_id": "M010", + "session": 3, + "memory_type": "decision" + } + }, + { + "text": "The finance board reduces the program cap for Asteria's greenhouse module from USD 420,000 to USD 390,000.", + "score": 0.5955466704877254, + "metadata": { + "record_id": "M021", + "session": 7, + "memory_type": "decision" + } + }, + { + "text": "Security review shortens telemetry retention from 90 days to 30 days.", + "score": 0.47541778968179665, + "metadata": { + "record_id": "M026", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "The greenhouse thermal ceiling is set to 24 degrees Celsius, and the backup oxygen supply lasts for 18 hours.", + "score": 0.47427246582107707, + "metadata": { + "record_id": "M006", + "session": 2, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q09", + "category": "current-state", + "query": "Which seed vendor and purchase order are currently approved?", + "retrieved_tokens": 94, + "score": { + "query_id": "Q09", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.756392076848299, + "metadata": { + "record_id": "M032", + "session": 10, + "memory_type": "fact" + } + }, + { + "text": "Helios Agritech is the approved seed vendor for Asteria's greenhouse module under purchase order PO-81.", + "score": 0.7350943381743017, + "metadata": { + "record_id": "M007", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "Nova Seedworks is the approved vendor for Asteria's greenhouse module under replacement order PO-96.", + "score": 0.6982074113987796, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + }, + { + "text": "The approved program cap for 
Asteria's greenhouse module is USD 420,000.", + "score": 0.5224732570058981, + "metadata": { + "record_id": "M010", + "session": 3, + "memory_type": "decision" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026.", + "score": 0.520347248275822, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q10", + "category": "current-state", + "query": "Which emergency valve procedure is active?", + "retrieved_tokens": 113, + "score": { + "query_id": "Q10", + "category": "current-state", + "coverage": 0.0, + "stale_leak": true, + "exact": false, + "matched_required": 0, + "required_total": 3, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "Emergency valve recovery in Asteria's greenhouse module uses procedure V1: isolate line B, then cycle pump 2.", + "score": 0.68529601392608, + "metadata": { + "record_id": "M008", + "session": 3, + "memory_type": "instruction" + } + }, + { + "text": "Operational alerts for Asteria's greenhouse module are sent to Slack channel #asteria-ops.", + "score": 0.54344273385708, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "The current operations channel for Asteria's greenhouse module is Matrix room #asteria-flight, not Slack.", + "score": 0.49859799889886325, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026, from Kiruna.", + "score": 0.48939228011808306, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + }, + { + "text": "The original launch window for Asteria's greenhouse module is August 14, 2026, from Kiruna.", + "score": 0.48310124603431304, + "metadata": { + "record_id": "M002", + "session": 1, + "memory_type": "event" + } + } + ] + }, + { + "query_id": "Q11", 
+ "category": "current-state", + "query": "What are the current telemetry retention period and mission call sign?", + "retrieved_tokens": 84, + "score": { + "query_id": "Q11", + "category": "current-state", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 2, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "Security review shortens telemetry retention from 90 days to 30 days.", + "score": 0.7154832530815267, + "metadata": { + "record_id": "M026", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is now Lumen.", + "score": 0.6273897277382867, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is Aurora.", + "score": 0.6073289362406444, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "Operational alerts for Asteria's greenhouse module are sent to Slack channel #asteria-ops.", + "score": 0.5378084633637019, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026.", + "score": 0.5374617931407182, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q12", + "category": "historical", + "query": "What crop was approved before revision C-2?", + "retrieved_tokens": 103, + "score": { + "query_id": "Q12", + "category": "historical", + "coverage": 1.0, + "stale_leak": false, + "exact": true, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 0 + }, + "hits": [ + { + "text": "The approved program cap for Asteria's greenhouse module is USD 390,000.", + "score": 0.6015441097653627, + "metadata": { + "record_id": "M024", + "session": 8, + "memory_type": 
"instruction" + } + }, + { + "text": "The approved program cap for Asteria's greenhouse module is USD 420,000.", + "score": 0.598498057394436, + "metadata": { + "record_id": "M010", + "session": 3, + "memory_type": "decision" + } + }, + { + "text": "Helios Agritech is the approved seed vendor for Asteria's greenhouse module under purchase order PO-81.", + "score": 0.5818787801754686, + "metadata": { + "record_id": "M007", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "Asteria's greenhouse module is GH-7, and the initial crop is Genovese basil.", + "score": 0.5742139736107843, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + }, + { + "text": "The finance board reduces the program cap for Asteria's greenhouse module from USD 420,000 to USD 390,000.", + "score": 0.5653739495984987, + "metadata": { + "record_id": "M021", + "session": 7, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q13", + "category": "historical", + "query": "Who was mission commander before Priya Nair?", + "retrieved_tokens": 95, + "score": { + "query_id": "Q13", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "Priya Nair replaces Elena Park as mission commander, and Dr. Amara Okafor remains an adviser.", + "score": 0.7931473555983217, + "metadata": { + "record_id": "M015", + "session": 5, + "memory_type": "relationship" + } + }, + { + "text": "Elena Park is mission commander, and Dr. 
Amara Okafor owns plant science.", + "score": 0.603326153635688, + "metadata": { + "record_id": "M003", + "session": 1, + "memory_type": "relationship" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is now Lumen.", + "score": 0.5156048953522829, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is Aurora.", + "score": 0.5108176015293273, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026, from Kiruna.", + "score": 0.4993429482056841, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q14", + "category": "historical", + "query": "What was the original landing site?", + "retrieved_tokens": 95, + "score": { + "query_id": "Q14", + "category": "historical", + "coverage": 1.0, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 1, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "The original landing site for Asteria's greenhouse module is Malapert Ridge.", + "score": 0.6775327234178945, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "The original launch window for Asteria's greenhouse module is August 14, 2026, from Kiruna.", + "score": 0.5200437967659124, + "metadata": { + "record_id": "M002", + "session": 1, + "memory_type": "event" + } + }, + { + "text": "Site review S-4 selects Shackleton rim and rejects Malapert Ridge.", + "score": 0.5144233114546932, + "metadata": { + "record_id": "M020", + "session": 7, + "memory_type": "decision" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is now Lumen.", + "score": 0.49772130781659557, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": 
"decision" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026, from Kiruna.", + "score": 0.48773878926221154, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q15", + "category": "historical", + "query": "Which valve procedure was used before V3?", + "retrieved_tokens": 98, + "score": { + "query_id": "Q15", + "category": "historical", + "coverage": 1.0, + "stale_leak": false, + "exact": true, + "matched_required": 3, + "required_total": 3, + "matched_forbidden": 0 + }, + "hits": [ + { + "text": "Emergency valve recovery in Asteria's greenhouse module uses procedure V1: isolate line B, then cycle pump 2.", + "score": 0.6078355113132485, + "metadata": { + "record_id": "M008", + "session": 3, + "memory_type": "instruction" + } + }, + { + "text": "The original landing site for Asteria's greenhouse module is Malapert Ridge.", + "score": 0.48535954798136616, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "The current operations channel for Asteria's greenhouse module is Matrix room #asteria-flight, not Slack.", + "score": 0.4764684827481529, + "metadata": { + "record_id": "M016", + "session": 5, + "memory_type": "instruction" + } + }, + { + "text": "The original launch window for Asteria's greenhouse module is August 14, 2026, from Kiruna.", + "score": 0.47525622878779217, + "metadata": { + "record_id": "M002", + "session": 1, + "memory_type": "event" + } + }, + { + "text": "Helios Agritech lot H-44 is recalled after contamination screening.", + "score": 0.4718512626444906, + "metadata": { + "record_id": "M022", + "session": 7, + "memory_type": "event" + } + } + ] + }, + { + "query_id": "Q16", + "category": "multi-hop", + "query": "Prepare the current command brief: commander, landing site, and call sign.", + "retrieved_tokens": 89, + "score": { + "query_id": "Q16", + "category": 
"multi-hop", + "coverage": 0.333333, + "stale_leak": true, + "exact": false, + "matched_required": 1, + "required_total": 3, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "The mission call sign for Asteria's greenhouse module is now Lumen.", + "score": 0.5813108701553253, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is Aurora.", + "score": 0.5652233579750989, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "Operational alerts for Asteria's greenhouse module are sent to Slack channel #asteria-ops.", + "score": 0.5319939340678468, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "The original landing site for Asteria's greenhouse module is Malapert Ridge.", + "score": 0.5042472064942829, + "metadata": { + "record_id": "M009", + "session": 3, + "memory_type": "fact" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026, from Kiruna.", + "score": 0.5023915990186951, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + } + ] + }, + { + "query_id": "Q17", + "category": "multi-hop", + "query": "Prepare the current greenhouse brief: module, crop, nutrient protocol, and temperature limit.", + "retrieved_tokens": 119, + "score": { + "query_id": "Q17", + "category": "multi-hop", + "coverage": 0.5, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 4, + "matched_forbidden": 2 + }, + "hits": [ + { + "text": "The baseline nutrient recipe for Asteria's greenhouse module is N-21, with a target root-zone pH of 5.9.", + "score": 0.7186872072763326, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + }, + { + "text": "The baseline nutrient recipe for Asteria's greenhouse module is 
N-17, with a target root-zone pH of 6.2.", + "score": 0.7130674526339672, + "metadata": { + "record_id": "M005", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "The launch window for Asteria's greenhouse module has been extended to September 2, 2026.", + "score": 0.6974092433764851, + "metadata": { + "record_id": "M017", + "session": 6, + "memory_type": "decision" + } + }, + { + "text": "The greenhouse thermal ceiling is set to 24 degrees Celsius, and the backup oxygen supply lasts for 18 hours.", + "score": 0.6969392593728911, + "metadata": { + "record_id": "M006", + "session": 2, + "memory_type": "fact" + } + }, + { + "text": "Asteria's greenhouse module is GH-7, and the initial crop is Genovese basil.", + "score": 0.6937884662098602, + "metadata": { + "record_id": "M001", + "session": 1, + "memory_type": "fact" + } + } + ] + }, + { + "query_id": "Q18", + "category": "multi-hop", + "query": "Prepare current logistics: vendor, order, and transfer battery.", + "retrieved_tokens": 91, + "score": { + "query_id": "Q18", + "category": "multi-hop", + "coverage": 0.666667, + "stale_leak": true, + "exact": false, + "matched_required": 2, + "required_total": 3, + "matched_forbidden": 1 + }, + "hits": [ + { + "text": "Nova Seedworks becomes the approved vendor under replacement order PO-96.", + "score": 0.5613021715863731, + "metadata": { + "record_id": "M032", + "session": 10, + "memory_type": "fact" + } + }, + { + "text": "Nova Seedworks is the approved vendor for Asteria's greenhouse module under replacement order PO-96.", + "score": 0.5374366569383625, + "metadata": { + "record_id": "M023", + "session": 8, + "memory_type": "decision" + } + }, + { + "text": "Helios Agritech is the approved seed vendor for Asteria's greenhouse module under purchase order PO-81.", + "score": 0.5334610799816606, + "metadata": { + "record_id": "M007", + "session": 2, + "memory_type": "decision" + } + }, + { + "text": "Operational alerts for Asteria's greenhouse module are 
sent to Slack channel #asteria-ops.", + "score": 0.5030076503494181, + "metadata": { + "record_id": "M004", + "session": 1, + "memory_type": "instruction" + } + }, + { + "text": "The mission call sign for Asteria's greenhouse module is now Lumen.", + "score": 0.49919176105311236, + "metadata": { + "record_id": "M027", + "session": 9, + "memory_type": "decision" + } + } + ] + } + ] + } + ], + "comparisons": [ + { + "baseline": "memanto-on-prem", + "challenger": "mem0-direct", + "coverage_delta": { + "observed_delta": 0.013889, + "ci95": [ + 0.0, + 0.041667 + ], + "samples": 5000, + "seed": 639 + } + }, + { + "baseline": "memanto-on-prem", + "challenger": "mem0-agentic", + "coverage_delta": { + "observed_delta": -0.277778, + "ci95": [ + -0.481482, + -0.092593 + ], + "samples": 5000, + "seed": 639 + } + } + ] +} \ No newline at end of file diff --git a/examples/benchmarks/temporal-memory-showdown/results/latest.md b/examples/benchmarks/temporal-memory-showdown/results/latest.md new file mode 100644 index 00000000..25d707fe --- /dev/null +++ b/examples/benchmarks/temporal-memory-showdown/results/latest.md @@ -0,0 +1,103 @@ +# Temporal Memory Showdown Results + +Generated: `2026-06-12T21:27:29.456574+00:00` + +## Primary showdown + +The primary comparison is Memanto against Mem0's default agentic (`infer=True`) pipeline. `mem0-direct` is retained as a vector-only ablation. + +- Memanto coverage advantage: **+27.8 percentage points** (paired bootstrap 95% CI +9.3 to +48.1 points). +- Full ingestion was **30,285.9x faster** (0.096s vs 2912.082s). +- Query p95 was **4.7% lower** (0.0983s vs 0.1032s). +- Memanto used **0 extraction LLM tokens** vs **134,690** native Ollama tokens. +- Stale-value leakage remains visible in both systems and is reported rather than filtered from the audit. 
+ +## Headline metrics + +| Backend | Coverage | Strict accuracy | Stale leak rate | Retrieved tokens | Query p95 | Ingest p95 | LLM tokens | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| memanto-on-prem | 97.2% | 0.0% | 100.0% | 1779 | 0.0983s | 0.0049s | 0 | +| mem0-direct | 98.6% | 0.0% | 100.0% | 1783 | 0.1038s | 0.1463s | 0 | +| mem0-agentic | 69.4% | 11.1% | 88.9% | 1793 | 0.1032s | 93.1412s | 134690 | + +## Per-query audit + +### memanto-on-prem + +| Query | Category | Coverage | Stale leak | Retrieved tokens | +| --- | --- | ---: | ---: | ---: | +| Q01 | current-state | 100.0% | yes | 103 | +| Q02 | current-state | 100.0% | yes | 110 | +| Q03 | current-state | 100.0% | yes | 90 | +| Q04 | current-state | 100.0% | yes | 85 | +| Q05 | current-state | 100.0% | yes | 102 | +| Q06 | current-state | 100.0% | yes | 97 | +| Q07 | current-state | 100.0% | yes | 101 | +| Q08 | current-state | 100.0% | yes | 113 | +| Q09 | current-state | 100.0% | yes | 97 | +| Q10 | current-state | 100.0% | yes | 95 | +| Q11 | current-state | 100.0% | yes | 96 | +| Q12 | historical | 100.0% | yes | 100 | +| Q13 | historical | 100.0% | yes | 103 | +| Q14 | historical | 100.0% | yes | 100 | +| Q15 | historical | 100.0% | yes | 103 | +| Q16 | multi-hop | 100.0% | yes | 90 | +| Q17 | multi-hop | 50.0% | yes | 106 | +| Q18 | multi-hop | 100.0% | yes | 88 | + +### mem0-direct + +| Query | Category | Coverage | Stale leak | Retrieved tokens | +| --- | --- | ---: | ---: | ---: | +| Q01 | current-state | 100.0% | yes | 97 | +| Q02 | current-state | 100.0% | yes | 103 | +| Q03 | current-state | 100.0% | yes | 90 | +| Q04 | current-state | 100.0% | yes | 86 | +| Q05 | current-state | 100.0% | yes | 107 | +| Q06 | current-state | 100.0% | yes | 104 | +| Q07 | current-state | 100.0% | yes | 101 | +| Q08 | current-state | 100.0% | yes | 115 | +| Q09 | current-state | 100.0% | yes | 95 | +| Q10 | current-state | 100.0% | yes | 95 | +| Q11 | current-state | 100.0% | yes | 94 | +| Q12 
| historical | 100.0% | yes | 107 | +| Q13 | historical | 100.0% | yes | 96 | +| Q14 | historical | 100.0% | yes | 101 | +| Q15 | historical | 100.0% | yes | 103 | +| Q16 | multi-hop | 100.0% | yes | 90 | +| Q17 | multi-hop | 75.0% | yes | 111 | +| Q18 | multi-hop | 100.0% | yes | 88 | + +### mem0-agentic + +| Query | Category | Coverage | Stale leak | Retrieved tokens | +| --- | --- | ---: | ---: | ---: | +| Q01 | current-state | 0.0% | yes | 98 | +| Q02 | current-state | 100.0% | yes | 115 | +| Q03 | current-state | 100.0% | yes | 91 | +| Q04 | current-state | 100.0% | yes | 90 | +| Q05 | current-state | 100.0% | yes | 116 | +| Q06 | current-state | 0.0% | yes | 94 | +| Q07 | current-state | 0.0% | yes | 110 | +| Q08 | current-state | 100.0% | yes | 98 | +| Q09 | current-state | 100.0% | yes | 94 | +| Q10 | current-state | 0.0% | yes | 113 | +| Q11 | current-state | 100.0% | yes | 84 | +| Q12 | historical | 100.0% | no | 103 | +| Q13 | historical | 100.0% | yes | 95 | +| Q14 | historical | 100.0% | yes | 95 | +| Q15 | historical | 100.0% | no | 98 | +| Q16 | multi-hop | 33.3% | yes | 89 | +| Q17 | multi-hop | 50.0% | yes | 119 | +| Q18 | multi-hop | 66.7% | yes | 91 | + +## Method notes + +- Both systems receive the same 32 records in the same order. +- Both systems answer the same 18 queries with the same `top_k`. +- Accuracy is deterministic concept coverage, not an LLM judge. +- Strict accuracy requires full concept coverage and no stale-value match. +- Stale leak rate measures whether superseded values appear in retrieved context. +- Token counts use `cl100k_base` only as a fixed cross-system accounting unit. +- Latency excludes one warm-up pass and includes all configured repeated queries. +- Mem0 agentic LLM tokens are native Ollama `prompt_eval_count` and `eval_count`. 
HERE = Path(__file__).resolve().parent

# Backend identifiers accepted by ``--backends``; validated before any run starts.
_BACKEND_NAMES = ("memanto", "mem0-direct", "mem0-agentic")


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the benchmark command line.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behavior.

    Returns:
        The populated namespace (``backends``, ``moorcheh_url``,
        ``ollama_url``, ``llm_model``, ``top_k``, ``repeats``,
        ``ready_timeout``, ``output``, ``markdown``).
    """
    parser = argparse.ArgumentParser(
        description="Benchmark real Memanto and Mem0 memory backends."
    )
    parser.add_argument(
        "--backends",
        default="memanto,mem0-direct,mem0-agentic",
        help="Comma-separated: memanto, mem0-direct, mem0-agentic",
    )
    parser.add_argument("--moorcheh-url", default="http://127.0.0.1:8080")
    parser.add_argument("--ollama-url", default="http://127.0.0.1:11434")
    parser.add_argument("--llm-model", default="qwen2.5:1.5b")
    parser.add_argument("--top-k", type=int, default=5)
    parser.add_argument("--repeats", type=int, default=5)
    parser.add_argument("--ready-timeout", type=float, default=180.0)
    parser.add_argument(
        "--output",
        type=Path,
        default=HERE / "results" / "latest.json",
    )
    parser.add_argument(
        "--markdown",
        type=Path,
        default=HERE / "results" / "latest.md",
    )
    return parser.parse_args(argv)


def build_backend(name: str, args: argparse.Namespace, run_id: str) -> MemoryBackend:
    """Instantiate the backend selected by ``name``.

    Args:
        name: One of ``memanto``, ``mem0-direct`` or ``mem0-agentic``.
        args: Parsed command line; supplies service URLs and the LLM model.
        run_id: Identifier shared by every backend of one benchmark run.

    Raises:
        ValueError: If ``name`` is not a known backend.
    """
    if name == "memanto":
        return MemantoBackend(args.moorcheh_url, run_id)
    if name in ("mem0-direct", "mem0-agentic"):
        # The two Mem0 variants differ only in the ``infer`` flag.
        return Mem0Backend(
            ollama_url=args.ollama_url,
            llm_model=args.llm_model,
            run_id=run_id,
            infer=name == "mem0-agentic",
            work_dir=HERE / ".benchmark-data",
        )
    raise ValueError(f"Unknown backend: {name}")


def run_backend(
    backend: MemoryBackend,
    *,
    top_k: int,
    repeats: int,
    ready_timeout: float,
) -> dict[str, Any]:
    """Ingest the dataset into ``backend``, query it and collect metrics.

    Every record in ``RECORDS`` is ingested and timed, the function waits
    until the backend is searchable, performs one untimed warm-up pass over
    ``QUERIES`` and then ``repeats`` timed passes. Retrieval quality is scored
    on the first timed pass only; every pass contributes to query latency.

    Args:
        backend: Backend under test.
        top_k: Number of hits requested per query; must be at least 1.
        repeats: Number of timed query passes; must be at least 1.
        ready_timeout: Seconds to wait for the backend to become searchable.

    Returns:
        A dict with ``backend``, ``summary``, ``metrics``, ``scores`` and
        ``queries`` keys.

    Raises:
        ValueError: If ``top_k`` or ``repeats`` is smaller than 1.
    """
    # Fail before the (potentially very slow) ingest instead of crashing in
    # ``mean([])`` once all the work has already been done.
    if top_k < 1 or repeats < 1:
        raise ValueError("top-k and repeats must be positive")

    process = psutil.Process()
    rss_before = process.memory_info().rss
    ingest_latencies: list[float] = []

    for record in RECORDS:
        started = time.perf_counter()
        backend.ingest(record)
        ingest_latencies.append(time.perf_counter() - started)

    ready_latency = wait_until_searchable(
        backend,
        query="What is the current mission call sign?",
        expected="Lumen",
        top_k=top_k,
        timeout_s=ready_timeout,
    )

    # Warm-up pass: results and timings are deliberately discarded.
    for case in QUERIES:
        backend.search(case.query, top_k)

    query_latencies: list[float] = []
    query_rows = []
    retrieved_tokens = 0
    scores = []
    for repeat in range(repeats):
        for case in QUERIES:
            started = time.perf_counter()
            hits = backend.search(case.query, top_k)
            query_latencies.append(time.perf_counter() - started)
            if repeat != 0:
                # Later passes only add latency samples.
                continue

            combined = "\n".join(hit.text for hit in hits)
            tokens = count_tokens(combined)
            retrieved_tokens += tokens
            score = score_query(case, combined)
            scores.append(score)
            query_rows.append(
                {
                    "query_id": case.query_id,
                    "category": case.category,
                    "query": case.query,
                    "retrieved_tokens": tokens,
                    "score": score.to_dict(),
                    "hits": [asdict(hit) for hit in hits],
                }
            )

    rss_after = process.memory_info().rss
    source_tokens = sum(count_tokens(record.text) for record in RECORDS)
    usage = backend.usage()
    summary = summarize_scores(scores)
    return {
        "backend": backend.name,
        "summary": summary,
        "metrics": {
            "records_ingested": len(RECORDS),
            "queries_evaluated": len(QUERIES),
            "top_k": top_k,
            "latency_repeats": repeats,
            "source_tokens": source_tokens,
            "retrieved_tokens": retrieved_tokens,
            "avg_retrieved_tokens_per_query": round(retrieved_tokens / len(QUERIES), 3),
            "ingest_total_s": round(sum(ingest_latencies), 6),
            "ingest_p50_s": round(percentile(ingest_latencies, 0.50), 6),
            "ingest_p95_s": round(percentile(ingest_latencies, 0.95), 6),
            "index_ready_s": round(ready_latency, 6),
            "query_mean_s": round(mean(query_latencies), 6),
            "query_p50_s": round(percentile(query_latencies, 0.50), 6),
            "query_p95_s": round(percentile(query_latencies, 0.95), 6),
            "client_rss_delta_mb": round((rss_after - rss_before) / 1024**2, 3),
            **usage,
        },
        "scores": [score.to_dict() for score in scores],
        "queries": query_rows,
    }


def environment_snapshot() -> dict[str, Any]:
    """Describe the host the benchmark ran on (time, Python, OS, CPU, RAM, git, Docker)."""
    return {
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "python": sys.version,
        "platform": platform.platform(),
        "machine": platform.machine(),
        "processor": platform.processor(),
        "logical_cpus": os.cpu_count(),
        "memory_gb": round(psutil.virtual_memory().total / 1024**3, 3),
        "git_commit": _command_output(["git", "rev-parse", "HEAD"]),
        "docker": _command_output(
            ["docker", "version", "--format", "{{.Server.Version}}"]
        ),
    }


def _command_output(command: list[str]) -> str | None:
    """Return the stripped stdout of ``command``.

    Returns ``None`` when the command cannot be started, exits non-zero,
    exceeds the 10 second timeout, or prints nothing.
    """
    try:
        result = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    return result.stdout.strip() or None


def _rescore(run: dict[str, Any]) -> list[Any]:
    """Score every query in ``QUERIES`` against the hit text stored in ``run``."""
    return [score_query(case, _query_text(run, case.query_id)) for case in QUERIES]


def build_comparisons(runs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Compare every later run against the first run with a paired bootstrap.

    Args:
        runs: Results produced by ``run_backend``; ``runs[0]`` is the baseline.

    Returns:
        One ``{"baseline", "challenger", "coverage_delta"}`` dict per
        challenger, or an empty list when fewer than two runs are given.
    """
    if len(runs) < 2:
        return []
    baseline = runs[0]
    # The baseline scores do not depend on the challenger: compute them once.
    baseline_scores = _rescore(baseline)
    return [
        {
            "baseline": baseline["backend"],
            "challenger": challenger["backend"],
            "coverage_delta": paired_bootstrap_delta(
                baseline_scores, _rescore(challenger)
            ),
        }
        for challenger in runs[1:]
    ]


def _query_text(run: dict[str, Any], query_id: str) -> str:
    """Return the newline-joined hit texts recorded for ``query_id`` in ``run``.

    Raises:
        KeyError: If the run holds no row for ``query_id``. (A bare ``next()``
            would leak ``StopIteration`` here, which is hard to diagnose.)
    """
    for row in run["queries"]:
        if row["query_id"] == query_id:
            return "\n".join(hit["text"] for hit in row["hits"])
    raise KeyError(
        f"Run {run.get('backend')!r} has no row for query {query_id!r}"
    )


def _llm_tokens(metrics: dict[str, Any]) -> int:
    """Return input plus output LLM tokens, counting missing counters as zero."""
    return metrics.get("llm_input_tokens", 0) + metrics.get("llm_output_tokens", 0)


def render_markdown(report: dict[str, Any]) -> str:
    """Render ``report`` as a Markdown summary.

    The "Primary showdown" section is emitted only when both the
    ``memanto-on-prem`` and ``mem0-agentic`` runs are present. Ratio bullets
    whose denominator is zero (timings are rounded, so very fast backends can
    report ``0.0``) are omitted instead of raising ``ZeroDivisionError``.

    Args:
        report: Dict with ``environment``, ``runs`` and optional
            ``comparisons`` entries, as assembled by ``main``.

    Returns:
        The Markdown document as a single string.
    """
    lines = [
        "# Temporal Memory Showdown Results",
        "",
        f"Generated: `{report['environment']['timestamp_utc']}`",
        "",
    ]

    runs_by_name = {run["backend"]: run for run in report["runs"]}
    memanto = runs_by_name.get("memanto-on-prem")
    mem0_agentic = runs_by_name.get("mem0-agentic")
    if memanto and mem0_agentic:
        memanto_metrics = memanto["metrics"]
        mem0_metrics = mem0_agentic["metrics"]
        comparison = next(
            (
                row
                for row in report.get("comparisons", [])
                if row["baseline"] == "memanto-on-prem"
                and row["challenger"] == "mem0-agentic"
            ),
            None,
        )
        lines.extend(
            [
                "## Primary showdown",
                "",
                "The primary comparison is Memanto against Mem0's default "
                "agentic (`infer=True`) pipeline. `mem0-direct` is retained "
                "as a vector-only ablation.",
                "",
            ]
        )
        if comparison:
            # The stored delta is challenger minus baseline; negate it (and
            # swap the interval bounds) to express Memanto's advantage.
            delta = comparison["coverage_delta"]
            advantage = -delta["observed_delta"]
            lower = -delta["ci95"][1]
            upper = -delta["ci95"][0]
            lines.append(
                f"- Memanto coverage advantage: "
                f"**{advantage * 100:+.1f} percentage points** "
                f"(paired bootstrap 95% CI {lower * 100:+.1f} to "
                f"{upper * 100:+.1f} points)."
            )
        if memanto_metrics["ingest_total_s"] > 0:
            ingest_speedup = (
                mem0_metrics["ingest_total_s"] / memanto_metrics["ingest_total_s"]
            )
            lines.append(
                f"- Full ingestion was **{ingest_speedup:,.1f}x faster** "
                f"({memanto_metrics['ingest_total_s']:.3f}s vs "
                f"{mem0_metrics['ingest_total_s']:.3f}s)."
            )
        if mem0_metrics["query_p95_s"] > 0:
            query_reduction = 1 - (
                memanto_metrics["query_p95_s"] / mem0_metrics["query_p95_s"]
            )
            lines.append(
                f"- Query p95 was **{query_reduction:.1%} lower** "
                f"({memanto_metrics['query_p95_s']:.4f}s vs "
                f"{mem0_metrics['query_p95_s']:.4f}s)."
            )
        mem0_llm_tokens = _llm_tokens(mem0_metrics)
        lines.extend(
            [
                f"- Memanto used **0 extraction LLM tokens** vs "
                f"**{mem0_llm_tokens:,}** native Ollama tokens.",
                "- Stale-value leakage remains visible in both systems and is "
                "reported rather than filtered from the audit.",
                "",
            ]
        )

    lines.extend(
        [
            "## Headline metrics",
            "",
            "| Backend | Coverage | Strict accuracy | Stale leak rate | Retrieved tokens | Query p95 | Ingest p95 | LLM tokens |",
            "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
        ]
    )
    for run in report["runs"]:
        summary = run["summary"]
        metrics = run["metrics"]
        lines.append(
            "| {backend} | {coverage:.1%} | {exact:.1%} | {stale:.1%} | "
            "{retrieved} | {query_p95:.4f}s | {ingest_p95:.4f}s | {llm_tokens} |".format(
                backend=run["backend"],
                coverage=summary["mean_coverage"],
                exact=summary["exact_accuracy"],
                stale=summary["stale_leak_rate"],
                retrieved=metrics["retrieved_tokens"],
                query_p95=metrics["query_p95_s"],
                ingest_p95=metrics["ingest_p95_s"],
                llm_tokens=_llm_tokens(metrics),
            )
        )

    lines.extend(["", "## Per-query audit", ""])
    for run in report["runs"]:
        lines.extend(
            [
                f"### {run['backend']}",
                "",
                "| Query | Category | Coverage | Stale leak | Retrieved tokens |",
                "| --- | --- | ---: | ---: | ---: |",
            ]
        )
        for row in run["queries"]:
            lines.append(
                f"| {row['query_id']} | {row['category']} | "
                f"{row['score']['coverage']:.1%} | "
                f"{'yes' if row['score']['stale_leak'] else 'no'} | "
                f"{row['retrieved_tokens']} |"
            )
        lines.append("")

    lines.extend(
        [
            "## Method notes",
            "",
            "- Both systems receive the same 32 records in the same order.",
            "- Both systems answer the same 18 queries with the same `top_k`.",
            "- Accuracy is deterministic concept coverage, not an LLM judge.",
            "- Strict accuracy requires full concept coverage and no stale-value match.",
            "- Stale leak rate measures whether superseded values appear in retrieved context.",
            "- Token counts use `cl100k_base` only as a fixed cross-system accounting unit.",
            "- Latency excludes one warm-up pass and includes all configured repeated queries.",
            "- Mem0 agentic LLM tokens are native Ollama `prompt_eval_count` and `eval_count`.",
            "",
        ]
    )
    return "\n".join(lines)


def main(argv: list[str] | None = None) -> int:
    """Run the selected backends and write the JSON and Markdown reports.

    Args:
        argv: Command-line arguments; ``None`` reads ``sys.argv[1:]``.

    Returns:
        Process exit code (``0`` on success).

    Raises:
        ValueError: If ``--top-k`` or ``--repeats`` is not positive, or if
            ``--backends`` names an unknown backend.
    """
    args = parse_args(argv)
    if args.top_k < 1 or args.repeats < 1:
        raise ValueError("top-k and repeats must be positive")
    selected = [name.strip() for name in args.backends.split(",") if name.strip()]
    # Reject typos up front: a single backend run can take many minutes, and
    # an unknown name used to surface only after the earlier ones finished.
    unknown = [name for name in selected if name not in _BACKEND_NAMES]
    if unknown:
        raise ValueError(f"Unknown backend: {', '.join(unknown)}")
    run_id = new_run_id()
    runs = []

    for name in selected:
        backend = build_backend(name, args, run_id)
        print(f"Running {backend.name}...", flush=True)
        try:
            runs.append(
                run_backend(
                    backend,
                    top_k=args.top_k,
                    repeats=args.repeats,
                    ready_timeout=args.ready_timeout,
                )
            )
        finally:
            # Always release backend resources, even when the run fails.
            backend.close()

    report = {
        "schema_version": 1,
        "run_id": run_id,
        "config": {
            "backends": selected,
            "top_k": args.top_k,
            "repeats": args.repeats,
            "llm_model": args.llm_model,
            "moorcheh_url": args.moorcheh_url,
            "ollama_url": args.ollama_url,
        },
        "dataset": {
            "records": len(RECORDS),
            "queries": len(QUERIES),
            "sessions": max(record.session for record in RECORDS),
        },
        "environment": environment_snapshot(),
        "runs": runs,
    }
    report["comparisons"] = build_comparisons(runs)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.markdown.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps(report, indent=2), encoding="utf-8")
    args.markdown.write_text(render_markdown(report), encoding="utf-8")
    print(f"Wrote {args.output}")
    print(f"Wrote {args.markdown}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
def test_dataset_shape_and_ids_are_stable() -> None:
    """Pin the dataset size (32 records, 18 queries) and require unique IDs."""
    assert len(RECORDS) == 32
    assert len(QUERIES) == 18
    # A set drops duplicates, so equal lengths mean every ID is unique.
    assert len({record.record_id for record in RECORDS}) == len(RECORDS)
    assert len({query.query_id for query in QUERIES}) == len(QUERIES)


def test_score_requires_all_concepts_and_penalizes_stale_context() -> None:
    """Full coverage is only ``exact`` when no stale value appears in the text."""
    # NOTE(review): the 4th/5th positional arguments look like required
    # concept groups (with aliases) and stale-value groups -- confirm against
    # dataset.QueryCase.
    case = QueryCase(
        "T1",
        "multi-hop",
        "current state",
        (("alpha",), ("beta", "bravo")),
        (("legacy",),),
    )
    clean = score_query(case, "Alpha and bravo are active.")
    assert clean.coverage == 1.0
    assert clean.exact is True
    assert clean.stale_leak is False

    # Same coverage, but the text also mentions the stale value "legacy".
    stale = score_query(case, "Alpha and beta are active; legacy remains.")
    assert stale.coverage == 1.0
    assert stale.exact is False
    assert stale.stale_leak is True


def test_percentile_uses_nearest_rank() -> None:
    """Pin percentile results for a 5-element list and for empty input (0.0)."""
    assert percentile([1, 2, 3, 4, 5], 0.95) == 5
    assert percentile([1, 2, 3, 4, 5], 0.50) == 3
    assert percentile([], 0.95) == 0.0


def test_summary_groups_categories() -> None:
    """The summary aggregates scores and reports them per query category."""
    current = score_query(QUERIES[0], "Dwarf radish is current.")
    historical = score_query(QUERIES[11], "Genovese basil was first.")
    summary = summarize_scores([current, historical])
    assert summary["mean_coverage"] == 1.0
    assert summary["exact_accuracy"] == 1.0
    assert set(summary["by_category"]) == {"current-state", "historical"}


def test_bootstrap_is_deterministic() -> None:
    """With a fixed seed and constant per-pair deltas the CI collapses to the delta."""
    baseline = [score_query(QUERIES[0], "basil") for _ in range(4)]
    challenger = [score_query(QUERIES[0], "dwarf radish") for _ in range(4)]
    result = paired_bootstrap_delta(baseline, challenger, samples=100, seed=123)
    assert result["observed_delta"] == 1.0
    assert result["ci95"] == [1.0, 1.0]


def test_mem0_config_matches_installed_schema(tmp_path: Path) -> None:
    """The generated Mem0 config must validate against the installed MemoryConfig."""
    # Imported lazily inside the test rather than at module import time.
    from mem0.configs.base import MemoryConfig

    config = build_mem0_config(
        ollama_url="http://127.0.0.1:11434",
        llm_model="qwen2.5:1.5b",
        run_id="test",
        backend_name="mem0-direct",
        work_dir=tmp_path,
    )
    parsed = MemoryConfig(**config)
    assert parsed.llm.provider == "ollama"
    assert parsed.embedder.config["embedding_dims"] == 768
    assert parsed.vector_store.config.collection_name.startswith("temporal_test")


def test_ollama_meter_reads_native_token_counts() -> None:
    """The meter records one call and copies Ollama's native token counters."""

    class FakeClient:
        # Stand-in client returning a fixed Ollama-style response dict.
        def chat(self, *args, **kwargs):
            return {"prompt_eval_count": 17, "eval_count": 5, "message": {}}

    meter = MeteredOllamaClient(FakeClient())
    meter.chat(model="test", messages=[])
    assert meter.calls == 1
    assert meter.input_tokens == 17
    assert meter.output_tokens == 5


def test_memanto_agent_bootstrap_retries_idempotently() -> None:
    """Agent creation is retried with the same agent_id after a failed first attempt."""

    class FlakyClient:
        # Fails on the first create_agent call and succeeds afterwards.
        def __init__(self):
            self.calls = 0

        def create_agent(self, **kwargs):
            self.calls += 1
            assert kwargs["agent_id"] == "bench-test"
            if self.calls == 1:
                raise RuntimeError("namespace committed before server returned 500")

    client = FlakyClient()
    create_memanto_agent(
        client,
        agent_id="bench-test",
        pattern="tool",
        description="benchmark",
        attempts=2,
        delay_s=0,
    )
    assert client.calls == 2


def test_runner_produces_auditable_report_shape() -> None:
    """run_backend output has the expected counts and renders to Markdown."""

    class FakeBackend:
        # In-memory backend: every search returns all ingested text as one hit.
        name = "fake"

        def __init__(self):
            self.records = []

        def ingest(self, record):
            self.records.append(record)

        def search(self, query, top_k):
            text = "\n".join(record.text for record in self.records)
            return [SearchHit(text=text)]

        def usage(self):
            return {
                "llm_calls": 0,
                "llm_input_tokens": 0,
                "llm_output_tokens": 0,
            }

        def close(self):
            return None

    result = run_backend(
        FakeBackend(),
        top_k=5,
        repeats=2,
        ready_timeout=1.0,
    )
    assert result["metrics"]["records_ingested"] == len(RECORDS)
    assert result["metrics"]["queries_evaluated"] == len(QUERIES)
    # Rows are recorded on the first timed pass only, so repeats=2 still
    # yields one row per query.
    assert len(result["queries"]) == len(QUERIES)

    # Minimal report: render_markdown reads "comparisons" via .get(), so the
    # key may be omitted.
    report = {
        "environment": {"timestamp_utc": "2026-06-13T00:00:00+00:00"},
        "runs": [result],
    }
    markdown = render_markdown(report)
    assert "Temporal Memory Showdown Results" in markdown
    assert "| fake |" in markdown
    def test_create_agent_accepts_on_prem_namespace_conflict(
        self, agent_service, mock_moorcheh_client
    ):
        """On-prem HTTP 409 is equivalent to the cloud SDK conflict."""

        # Generic exception that only carries ``status_code = 409``; the
        # service checks that attribute via ``getattr(e, "status_code", None)``.
        class OnPremConflict(Exception):
            status_code = 409

        # Must be installed before create_agent runs so namespace creation raises.
        mock_moorcheh_client.namespaces.create.side_effect = OnPremConflict(
            "Namespace already exists."
        )
        agent = agent_service.create_agent(
            AgentCreate(agent_id="existing-agent", pattern=AgentPattern.TOOL),
            moorcheh_api_key="on-prem-local",
        )

        # The conflict must be tolerated: the agent is still created and registered.
        assert agent.agent_id == "existing-agent"
        assert agent.namespace == "memanto_agent_existing-agent"
        assert agent_service.agent_exists("existing-agent")