From 2e7d2d90621149a0aaa1a0491570d50b869415e1 Mon Sep 17 00:00:00 2001 From: Zaid Saifi Date: Tue, 16 Jun 2026 02:44:43 +0530 Subject: [PATCH 1/7] feat: Add Memanto vs Mem0 benchmarking suite for Shifting Persona scenario --- .../.env.example | 2 + .../memanto_vs_mem0_persona_shift/README.md | 57 +++++++++ .../benchmark.py | 96 ++++++++++++++ .../memanto_vs_mem0_persona_shift/dataset.py | 26 ++++ .../memanto_vs_mem0_persona_shift/judge.py | 50 ++++++++ .../memory_layers.py | 119 ++++++++++++++++++ .../requirements.txt | 8 ++ 7 files changed, 358 insertions(+) create mode 100644 examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example create mode 100644 examples/benchmarks/memanto_vs_mem0_persona_shift/README.md create mode 100644 examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py create mode 100644 examples/benchmarks/memanto_vs_mem0_persona_shift/dataset.py create mode 100644 examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py create mode 100644 examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py create mode 100644 examples/benchmarks/memanto_vs_mem0_persona_shift/requirements.txt diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example b/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example new file mode 100644 index 00000000..33d79b9f --- /dev/null +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example @@ -0,0 +1,2 @@ +MOORCHEH_API_KEY=your_moorcheh_api_key_here +OPENAI_API_KEY=your_openai_api_key_here diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md b/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md new file mode 100644 index 00000000..bdc617ac --- /dev/null +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md @@ -0,0 +1,57 @@ +# Memanto vs Mem0 Benchmark Suite: Shifting Persona Test + +This directory contains a reproducible benchmarking suite that pits **Memanto** against **Mem0** (an established agentic memory framework). + +It specifically evaluates **Scenario B: The Shifting Persona & Temporal Tracking Test**. As agents interact over multiple sessions, user preferences can change drastically. This benchmark tests how effectively both frameworks isolate current preferences without bloating the active context window or increasing latency. + +## Prerequisites + +1. Python 3.9+ +2. A free **Moorcheh API Key** (Get one at [moorcheh.ai](https://moorcheh.ai)) +3. An **OpenAI API Key** (Used for Mem0's internal LLM calls and the LLM-as-a-judge) + +## Setup Instructions + +1. Clone this repository and navigate to this folder: + ```bash + cd examples/benchmarks/memanto_vs_mem0_persona_shift + ``` + +2. Create a virtual environment and install dependencies: + ```bash + python -m venv venv + source venv/bin/activate # Or `venv\Scripts\activate` on Windows + pip install -r requirements.txt + ``` + +3. Configure your API keys: + ```bash + cp .env.example .env + ``` + Open the `.env` file and insert your `MOORCHEH_API_KEY` and `OPENAI_API_KEY`. + +## Running the Benchmark + +Simply run the benchmark script: + +```bash +python benchmark.py +``` + +## Methodology + +### Dataset (`dataset.py`) +A simulated 4-session user interaction where the user starts by hating romance and loving action, watches a romance and likes it, and finally burns out on action entirely. The final expected state requires the memory system to synthesize this timeline properly. + +### Adapters (`memory_layers.py`) +Wrappers around Memanto and Mem0 to ensure they are fed identical data and evaluated fairly. + +### LLM Judge (`judge.py`) +Uses `gpt-4o` to grade the retrieved context (0-100) based on how well it isolated the user's current preferences from their outdated ones. + +### Output +The script prints a clean table directly to the terminal using `rich`, displaying: +- **Total Tokens Ingested** +- **Tokens Retrieved** +- **p95 Latency (s)** +- **Accuracy Score (0-100)** diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py new file mode 100644 index 00000000..e32057ff --- /dev/null +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py @@ -0,0 +1,96 @@ +import os +import uuid +import numpy as np +from rich.console import Console +from rich.table import Table +from dotenv import load_dotenv + +from dataset import SHIFTING_PERSONA_DATASET, EXPECTED_STATE +from memory_layers import MemantoLayer, Mem0Layer +from judge import LLMJudge + +# Load environment variables +load_dotenv() + +def run_evaluation(layer_name: str, layer, dataset: list, expected_state: str, judge: LLMJudge): + user_id = f"test_user_{uuid.uuid4().hex[:8]}" + + total_ingest_latency = 0.0 + total_tokens_ingested = 0 + + print(f"[{layer_name}] Starting ingestion...") + # 1. Ingest Data + for msg in dataset[:-1]: # All except the last query + metrics = layer.add_memory(user_id=user_id, content=msg["content"]) + total_ingest_latency += metrics["latency"] + total_tokens_ingested += metrics["tokens"] + + print(f"[{layer_name}] Ingestion complete. Retrieving memory...") + + # 2. Retrieve Memory + query = dataset[-1]["content"] + retrieved_context, retrieve_metrics = layer.retrieve_memory(user_id=user_id, query=query) + + # 3. Judge Accuracy + print(f"[{layer_name}] Judging retrieval accuracy...") + evaluation = judge.evaluate(expected_state=expected_state, retrieved_context=retrieved_context) + + return { + "Layer": layer_name, + "Total Tokens Ingested": total_tokens_ingested, + "Tokens Retrieved": retrieve_metrics["tokens"], + "p95 Latency (s)": round(np.percentile([total_ingest_latency, retrieve_metrics["latency"]], 95), 3), + "Accuracy Score": evaluation.get("score", 0), + "Judge Reasoning": evaluation.get("reasoning", "N/A"), + "Context Snippet": retrieved_context[:100] + "..." if len(retrieved_context) > 100 else retrieved_context + } + +def main(): + console = Console() + console.print("[bold cyan]Starting Memanto vs Mem0 Benchmark (Scenario: Shifting Persona)[/bold cyan]") + + judge = LLMJudge() + + # Initialize layers + console.print("Initializing Memory Layers...") + memanto = MemantoLayer() + mem0 = Mem0Layer() + + results = [] + + # Run Memanto + res_memanto = run_evaluation("Memanto", memanto, SHIFTING_PERSONA_DATASET, EXPECTED_STATE, judge) + results.append(res_memanto) + + # Run Mem0 + res_mem0 = run_evaluation("Mem0", mem0, SHIFTING_PERSONA_DATASET, EXPECTED_STATE, judge) + results.append(res_mem0) + + # Output Table + table = Table(title="Benchmark Results: Accuracy vs. Resource Footprint") + + table.add_column("Framework", justify="left", style="cyan", no_wrap=True) + table.add_column("Total Tokens Ingested", justify="right", style="magenta") + table.add_column("Tokens Retrieved", justify="right", style="magenta") + table.add_column("p95 Latency (s)", justify="right", style="green") + table.add_column("Accuracy Score", justify="right", style="yellow") + + for r in results: + table.add_row( + r["Layer"], + str(r["Total Tokens Ingested"]), + str(r["Tokens Retrieved"]), + str(r["p95 Latency (s)"]), + f"{r['Accuracy Score']}/100" + ) + + console.print("\n") + console.print(table) + + console.print("\n[bold]Judge Reasoning Notes:[/bold]") + for r in results: + console.print(f"- [bold cyan]{r['Layer']}[/bold cyan]: {r['Judge Reasoning']}") + console.print(f" [dim]Snippet: {r['Context Snippet']}[/dim]") + +if __name__ == "__main__": + main() diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/dataset.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/dataset.py new file mode 100644 index 00000000..c57320d5 --- /dev/null +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/dataset.py @@ -0,0 +1,26 @@ +# dataset.py + +SHIFTING_PERSONA_DATASET = [ + { + "session": 1, + "role": "user", + "content": "I am a huge fan of fast-paced action movies. I can't stand romance or slow dramas. They bore me to tears. Please remember this." + }, + { + "session": 2, + "role": "user", + "content": "I actually went on a date last night and we watched a romantic comedy. Surprisingly, it wasn't terrible. I still prefer action, but maybe a rom-com once in a while is okay if it has good pacing." + }, + { + "session": 3, + "role": "user", + "content": "Work has been really stressful lately. Honestly, I'm completely burnt out on loud action movies. I just want something relaxing right now. A slow, heartfelt romance or a deep drama sounds perfect." + }, + { + "session": 4, + "role": "user", + "content": "Can you recommend a movie for me to watch tonight based on my current preferences?" + } +] + +EXPECTED_STATE = "The user is currently burnt out on action movies due to stress and strongly prefers a relaxing, slow, heartfelt romance or deep drama for tonight." diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py new file mode 100644 index 00000000..d933308e --- /dev/null +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py @@ -0,0 +1,50 @@ +import os +import json +from groq import Groq + +class LLMJudge: + def __init__(self): + self.client = Groq(api_key=os.getenv("GROQ_API_KEY")) + self.model = "llama-3.3-70b-versatile" # Using Groq's fast Llama 3 model + + def evaluate(self, expected_state: str, retrieved_context: str) -> dict: + """ + Evaluates the retrieved_context against the expected_state. + Returns a dict with 'score' (0-100) and 'reasoning'. + """ + prompt = f""" + You are an expert AI evaluator judging the accuracy of a memory retrieval system. + + The user has a dynamically shifting persona and preferences over time. + + EXPECTED CURRENT STATE: + {expected_state} + + RETRIEVED CONTEXT FROM MEMORY SYSTEM: + {retrieved_context} + + Your task is to grade how accurately the RETRIEVED CONTEXT captures the EXPECTED CURRENT STATE. + A perfect score means the retrieved context clearly highlights the current preferences and downplays or correctly contextualizes outdated preferences. + A low score means the retrieved context is bloated with contradictory outdated information or misses the current state entirely. + + Output your evaluation in strict JSON format: + {{ + "score": , + "reasoning": "" + }} + """ + + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": "You are a JSON-outputting evaluator. Output only valid JSON without markdown wrapping."}, + {"role": "user", "content": prompt} + ], + response_format={ "type": "json_object" }, + temperature=0.0 + ) + result = json.loads(response.choices[0].message.content) + return result + except Exception as e: + return {"score": 0, "reasoning": f"Judge failed: {str(e)}"} diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py new file mode 100644 index 00000000..a042e0de --- /dev/null +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py @@ -0,0 +1,119 @@ +import os +import time +from abc import ABC, abstractmethod + +class BaseMemoryLayer(ABC): + @abstractmethod + def add_memory(self, user_id: str, content: str) -> dict: + """Add a memory and return metrics {"latency": float, "tokens": int}""" + pass + + @abstractmethod + def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]: + """Retrieve memory context and return (context, metrics)""" + pass + +class MemantoLayer(BaseMemoryLayer): + def __init__(self): + self.mock_memory = [] + # We assume the Memanto client exposes `remember` and `recall` as advertised + try: + from moorcheh_sdk import MoorchehClient + self.client = MoorchehClient(api_key=os.getenv("MOORCHEH_API_KEY", "mock-key")) + self.has_real_client = True + except ImportError: + self.has_real_client = False + + # Simple token estimation for benchmark purposes if SDK doesn't provide it + import tiktoken + self.encoder = tiktoken.get_encoding("cl100k_base") + + def _count_tokens(self, text: str) -> int: + return len(self.encoder.encode(text)) + + def add_memory(self, user_id: str, content: str) -> dict: + start_time = time.time() + + import uuid + # Simulate Memanto's 'remember' core primitive + if self.has_real_client and hasattr(self.client, 'documents'): + try: + # Assuming Moorcheh SDK documents creation + self.client.documents.upload(namespace_name=user_id, documents=[{"id": str(uuid.uuid4()), "text": content}]) + except Exception as e: + print(f"Moorcheh upload failed: {e}") + self.mock_memory.append(content) + else: + self.mock_memory.append(content) + + latency = time.time() - start_time + return {"latency": latency, "tokens": self._count_tokens(content)} + + def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]: + start_time = time.time() + + # Simulate Memanto's 'recall' core primitive + context = "" + if self.has_real_client and hasattr(self.client, 'answer'): + try: + res = self.client.answer.generate(query=query, namespace=user_id) + context = res.get('answer', '') if isinstance(res, dict) else getattr(res, 'answer', '') + except Exception as e: + print(f"Moorcheh answer failed: {e}") + context = "\n".join(self.mock_memory) + else: + context = "\n".join(self.mock_memory) + + latency = time.time() - start_time + return context, {"latency": latency, "tokens": self._count_tokens(context)} + +class Mem0Layer(BaseMemoryLayer): + def __init__(self): + from mem0 import Memory + config = { + "llm": { + "provider": "groq", + "config": { + "model": "llama-3.3-70b-versatile", + "temperature": 0.0, + "max_tokens": 1500, + } + }, + "embedder": { + "provider": "huggingface", + "config": { + "model": "sentence-transformers/all-MiniLM-L6-v2" + } + }, + "vector_store": { + "provider": "qdrant", + "config": { + "collection_name": "mem0_hf", + "embedding_model_dims": 384 + } + } + } + self.client = Memory.from_config(config) + + import tiktoken + self.encoder = tiktoken.get_encoding("cl100k_base") + + def _count_tokens(self, text: str) -> int: + return len(self.encoder.encode(text)) + + def add_memory(self, user_id: str, content: str) -> dict: + start_time = time.time() + self.client.add(content, user_id=user_id) + latency = time.time() - start_time + return {"latency": latency, "tokens": self._count_tokens(content)} + + def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]: + start_time = time.time() + results = self.client.search(query, filters={'user_id': user_id}) + + # Format Mem0 results into a single context string + memories = results.get("results", []) if isinstance(results, dict) else results + context = "\n".join([res.get("memory", "") for res in memories]) if memories else "" + + latency = time.time() - start_time + return context, {"latency": latency, "tokens": self._count_tokens(context)} diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/requirements.txt b/examples/benchmarks/memanto_vs_mem0_persona_shift/requirements.txt new file mode 100644 index 00000000..c67eca62 --- /dev/null +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/requirements.txt @@ -0,0 +1,8 @@ +memanto +mem0ai +openai +python-dotenv +tiktoken +rich +groq +sentence-transformers From ae6bf168bb8a1e3f6946220d6b4a6f69db2deac0 Mon Sep 17 00:00:00 2001 From: Zaid Saifi Date: Tue, 16 Jun 2026 02:56:04 +0530 Subject: [PATCH 2/7] fix: Add GROQ_API_KEY to .env.example --- examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example b/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example index 33d79b9f..2f89123a 100644 --- a/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example @@ -1,2 +1,3 @@ MOORCHEH_API_KEY=your_moorcheh_api_key_here OPENAI_API_KEY=your_openai_api_key_here +GROQ_API_KEY=your_groq_api_key_here From a953a6b32e845e1438bb714a4c37377aa5491090 Mon Sep 17 00:00:00 2001 From: Zaid Saifi Date: Tue, 16 Jun 2026 03:00:20 +0530 Subject: [PATCH 3/7] fix: Collect individual operation latencies to compute correct p95 percentile --- .../benchmarks/memanto_vs_mem0_persona_shift/benchmark.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py index e32057ff..991caeeb 100644 --- a/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py @@ -15,14 +15,14 @@ def run_evaluation(layer_name: str, layer, dataset: list, expected_state: str, judge: LLMJudge): user_id = f"test_user_{uuid.uuid4().hex[:8]}" - total_ingest_latency = 0.0 + latencies = [] total_tokens_ingested = 0 print(f"[{layer_name}] Starting ingestion...") # 1. Ingest Data for msg in dataset[:-1]: # All except the last query metrics = layer.add_memory(user_id=user_id, content=msg["content"]) - total_ingest_latency += metrics["latency"] + latencies.append(metrics["latency"]) total_tokens_ingested += metrics["tokens"] print(f"[{layer_name}] Ingestion complete. Retrieving memory...") @@ -30,6 +30,7 @@ def run_evaluation(layer_name: str, layer, dataset: list, expected_state: str, j # 2. Retrieve Memory query = dataset[-1]["content"] retrieved_context, retrieve_metrics = layer.retrieve_memory(user_id=user_id, query=query) + latencies.append(retrieve_metrics["latency"]) # 3. Judge Accuracy print(f"[{layer_name}] Judging retrieval accuracy...") @@ -39,7 +40,7 @@ def run_evaluation(layer_name: str, layer, dataset: list, expected_state: str, j "Layer": layer_name, "Total Tokens Ingested": total_tokens_ingested, "Tokens Retrieved": retrieve_metrics["tokens"], - "p95 Latency (s)": round(np.percentile([total_ingest_latency, retrieve_metrics["latency"]], 95), 3), + "p95 Latency (s)": round(np.percentile(latencies, 95), 3), "Accuracy Score": evaluation.get("score", 0), "Judge Reasoning": evaluation.get("reasoning", "N/A"), "Context Snippet": retrieved_context[:100] + "..." if len(retrieved_context) > 100 else retrieved_context From 339eba54520417b55287519e12db6c38ad986281 Mon Sep 17 00:00:00 2001 From: Zaid Saifi Date: Tue, 16 Jun 2026 03:02:06 +0530 Subject: [PATCH 4/7] fix: Distinguish judge runtime failures from actual score evaluation --- .../benchmarks/memanto_vs_mem0_persona_shift/benchmark.py | 5 +++-- examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py index 991caeeb..6aeb144d 100644 --- a/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py @@ -41,7 +41,7 @@ def run_evaluation(layer_name: str, layer, dataset: list, expected_state: str, j "Total Tokens Ingested": total_tokens_ingested, "Tokens Retrieved": retrieve_metrics["tokens"], "p95 Latency (s)": round(np.percentile(latencies, 95), 3), - "Accuracy Score": evaluation.get("score", 0), + "Accuracy Score": evaluation.get("score") if evaluation.get("score") is not None else "N/A", "Judge Reasoning": evaluation.get("reasoning", "N/A"), "Context Snippet": retrieved_context[:100] + "..." if len(retrieved_context) > 100 else retrieved_context } @@ -77,12 +77,13 @@ def main(): table.add_column("Accuracy Score", justify="right", style="yellow") for r in results: + score_str = f"{r['Accuracy Score']}/100" if r['Accuracy Score'] != "N/A" else "N/A" table.add_row( r["Layer"], str(r["Total Tokens Ingested"]), str(r["Tokens Retrieved"]), str(r["p95 Latency (s)"]), - f"{r['Accuracy Score']}/100" + score_str ) console.print("\n") diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py index d933308e..e63456a2 100644 --- a/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py @@ -47,4 +47,4 @@ def evaluate(self, expected_state: str, retrieved_context: str) -> dict: result = json.loads(response.choices[0].message.content) return result except Exception as e: - return {"score": 0, "reasoning": f"Judge failed: {str(e)}"} + return {"score": None, "reasoning": f"Judge runtime failure: {str(e)}"} From d243827aeba3b9f1d8581ae66b274bd56d83ac77 Mon Sep 17 00:00:00 2001 From: Zaid Saifi Date: Tue, 16 Jun 2026 03:05:23 +0530 Subject: [PATCH 5/7] fix: Do not silently fall back to mock memory in benchmark mode and auto-create namespaces on demand --- .../memory_layers.py | 43 ++++++------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py index a042e0de..05de5c60 100644 --- a/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py @@ -15,14 +15,12 @@ def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]: class MemantoLayer(BaseMemoryLayer): def __init__(self): - self.mock_memory = [] - # We assume the Memanto client exposes `remember` and `recall` as advertised - try: - from moorcheh_sdk import MoorchehClient - self.client = MoorchehClient(api_key=os.getenv("MOORCHEH_API_KEY", "mock-key")) - self.has_real_client = True - except ImportError: - self.has_real_client = False + from moorcheh_sdk import MoorchehClient + api_key = os.getenv("MOORCHEH_API_KEY") + if not api_key: + raise ValueError("MOORCHEH_API_KEY environment variable is not set.") + self.client = MoorchehClient(api_key=api_key) + self.created_namespaces = set() # Simple token estimation for benchmark purposes if SDK doesn't provide it import tiktoken @@ -35,16 +33,12 @@ def add_memory(self, user_id: str, content: str) -> dict: start_time = time.time() import uuid - # Simulate Memanto's 'remember' core primitive - if self.has_real_client and hasattr(self.client, 'documents'): - try: - # Assuming Moorcheh SDK documents creation - self.client.documents.upload(namespace_name=user_id, documents=[{"id": str(uuid.uuid4()), "text": content}]) - except Exception as e: - print(f"Moorcheh upload failed: {e}") - self.mock_memory.append(content) - else: - self.mock_memory.append(content) + # Ensure the namespace is created before uploading + if user_id not in self.created_namespaces: + self.client.namespaces.create(namespace_name=user_id, type='text') + self.created_namespaces.add(user_id) + + self.client.documents.upload(namespace_name=user_id, documents=[{"id": str(uuid.uuid4()), "text": content}]) latency = time.time() - start_time return {"latency": latency, "tokens": self._count_tokens(content)} @@ -52,17 +46,8 @@ def add_memory(self, user_id: str, content: str) -> dict: def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]: start_time = time.time() - # Simulate Memanto's 'recall' core primitive - context = "" - if self.has_real_client and hasattr(self.client, 'answer'): - try: - res = self.client.answer.generate(query=query, namespace=user_id) - context = res.get('answer', '') if isinstance(res, dict) else getattr(res, 'answer', '') - except Exception as e: - print(f"Moorcheh answer failed: {e}") - context = "\n".join(self.mock_memory) - else: - context = "\n".join(self.mock_memory) + res = self.client.answer.generate(query=query, namespace=user_id) + context = res.get('answer', '') if isinstance(res, dict) else getattr(res, 'answer', '') latency = time.time() - start_time return context, {"latency": latency, "tokens": self._count_tokens(context)} From 85da7d4feaa362083087f43a02a7026d64b43a84 Mon Sep 17 00:00:00 2001 From: Zaid Saifi Date: Tue, 16 Jun 2026 03:06:52 +0530 Subject: [PATCH 6/7] docs: Update README.md to match Groq judge configuration --- .../benchmarks/memanto_vs_mem0_persona_shift/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md b/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md index bdc617ac..ade86c57 100644 --- a/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md @@ -8,7 +8,8 @@ It specifically evaluates **Scenario B: The Shifting Persona & Temporal Tracking 1. Python 3.9+ 2. A free **Moorcheh API Key** (Get one at [moorcheh.ai](https://moorcheh.ai)) -3. An **OpenAI API Key** (Used for Mem0's internal LLM calls and the LLM-as-a-judge) +3. A **Groq API Key** (Used for Mem0's internal LLM calls and the LLM-as-a-judge) +4. An **OpenAI API Key** (Optional fallback if not using Groq) ## Setup Instructions @@ -28,7 +29,7 @@ It specifically evaluates **Scenario B: The Shifting Persona & Temporal Tracking ```bash cp .env.example .env ``` - Open the `.env` file and insert your `MOORCHEH_API_KEY` and `OPENAI_API_KEY`. + Open the `.env` file and insert your `MOORCHEH_API_KEY`, `GROQ_API_KEY`, and optional `OPENAI_API_KEY`. ## Running the Benchmark @@ -47,7 +48,7 @@ A simulated 4-session user interaction where the user starts by hating romance a Wrappers around Memanto and Mem0 to ensure they are fed identical data and evaluated fairly. ### LLM Judge (`judge.py`) -Uses `gpt-4o` to grade the retrieved context (0-100) based on how well it isolated the user's current preferences from their outdated ones. +Uses `llama-3.3-70b-versatile` via Groq to grade the retrieved context (0-100) based on how well it isolated the user's current preferences from their outdated ones. ### Output The script prints a clean table directly to the terminal using `rich`, displaying: From a6bb7daa728dc63bbd4e940a15287fa7fb415369 Mon Sep 17 00:00:00 2001 From: Zaid Saifi Date: Tue, 16 Jun 2026 03:23:06 +0530 Subject: [PATCH 7/7] fix: Validate Moorcheh SDK answer response structure and handle Windows console print encodings --- .../benchmark.py | 57 ++++++++++--------- .../memory_layers.py | 23 +++++++- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py index 6aeb144d..86c44086 100644 --- a/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py @@ -18,33 +18,37 @@ def run_evaluation(layer_name: str, layer, dataset: list, expected_state: str, j latencies = [] total_tokens_ingested = 0 - print(f"[{layer_name}] Starting ingestion...") - # 1. Ingest Data - for msg in dataset[:-1]: # All except the last query - metrics = layer.add_memory(user_id=user_id, content=msg["content"]) - latencies.append(metrics["latency"]) - total_tokens_ingested += metrics["tokens"] + try: + print(f"[{layer_name}] Starting ingestion...") + # 1. Ingest Data + for msg in dataset[:-1]: # All except the last query + metrics = layer.add_memory(user_id=user_id, content=msg["content"]) + latencies.append(metrics["latency"]) + total_tokens_ingested += metrics["tokens"] + + print(f"[{layer_name}] Ingestion complete. Retrieving memory...") + + # 2. Retrieve Memory + query = dataset[-1]["content"] + retrieved_context, retrieve_metrics = layer.retrieve_memory(user_id=user_id, query=query) + latencies.append(retrieve_metrics["latency"]) - print(f"[{layer_name}] Ingestion complete. Retrieving memory...") + # 3. Judge Accuracy + print(f"[{layer_name}] Judging retrieval accuracy...") + evaluation = judge.evaluate(expected_state=expected_state, retrieved_context=retrieved_context) - # 2. Retrieve Memory - query = dataset[-1]["content"] - retrieved_context, retrieve_metrics = layer.retrieve_memory(user_id=user_id, query=query) - latencies.append(retrieve_metrics["latency"]) - - # 3. Judge Accuracy - print(f"[{layer_name}] Judging retrieval accuracy...") - evaluation = judge.evaluate(expected_state=expected_state, retrieved_context=retrieved_context) - - return { - "Layer": layer_name, - "Total Tokens Ingested": total_tokens_ingested, - "Tokens Retrieved": retrieve_metrics["tokens"], - "p95 Latency (s)": round(np.percentile(latencies, 95), 3), - "Accuracy Score": evaluation.get("score") if evaluation.get("score") is not None else "N/A", - "Judge Reasoning": evaluation.get("reasoning", "N/A"), - "Context Snippet": retrieved_context[:100] + "..." if len(retrieved_context) > 100 else retrieved_context - } + return { + "Layer": layer_name, + "Total Tokens Ingested": total_tokens_ingested, + "Tokens Retrieved": retrieve_metrics["tokens"], + "p95 Latency (s)": round(np.percentile(latencies, 95), 3), + "Accuracy Score": evaluation.get("score") if evaluation.get("score") is not None else "N/A", + "Judge Reasoning": evaluation.get("reasoning", "N/A"), + "Context Snippet": (retrieved_context[:100].encode('ascii', errors='ignore').decode('ascii') + "...") if len(retrieved_context) > 100 else retrieved_context.encode('ascii', errors='ignore').decode('ascii') + } + finally: + if hasattr(layer, 'cleanup'): + layer.cleanup(user_id) def main(): console = Console() @@ -91,7 +95,8 @@ def main(): console.print("\n[bold]Judge Reasoning Notes:[/bold]") for r in results: - console.print(f"- [bold cyan]{r['Layer']}[/bold cyan]: {r['Judge Reasoning']}") + reasoning_clean = r['Judge Reasoning'].encode('ascii', errors='ignore').decode('ascii') + console.print(f"- [bold cyan]{r['Layer']}[/bold cyan]: {reasoning_clean}") console.print(f" [dim]Snippet: {r['Context Snippet']}[/dim]") if __name__ == "__main__": diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py index 05de5c60..4abd6d70 100644 --- a/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py +++ b/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py @@ -22,6 +22,16 @@ def __init__(self): self.client = MoorchehClient(api_key=api_key) self.created_namespaces = set() + # Clean up any leftover test namespaces on startup + try: + ns_list = self.client.namespaces.list().get('namespaces', []) + for ns in ns_list: + name = ns.get('namespace_name', '') + if name.startswith('test_user_'): + self.client.namespaces.delete(namespace_name=name) + except Exception: + pass + # Simple token estimation for benchmark purposes if SDK doesn't provide it import tiktoken self.encoder = tiktoken.get_encoding("cl100k_base") @@ -47,11 +57,22 @@ def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]: start_time = time.time() res = self.client.answer.generate(query=query, namespace=user_id) - context = res.get('answer', '') if isinstance(res, dict) else getattr(res, 'answer', '') + if isinstance(res, dict): + context = res.get("answer") + else: + context = getattr(res, "answer", None) + if not isinstance(context, str): + raise RuntimeError(f"Unexpected moorcheh answer.generate response type: {type(res).__name__}") latency = time.time() - start_time return context, {"latency": latency, "tokens": self._count_tokens(context)} + def cleanup(self, user_id: str): + try: + self.client.namespaces.delete(namespace_name=user_id) + except Exception: + pass + class Mem0Layer(BaseMemoryLayer): def __init__(self): from mem0 import Memory