moorcheh-ai · zaid1234-11 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example b/examples/benchmarks/memanto_vs_mem0_persona_shift/.env.example
@@ -0,0 +1,3 @@
+MOORCHEH_API_KEY=your_moorcheh_api_key_here
+OPENAI_API_KEY=your_openai_api_key_here
+GROQ_API_KEY=your_groq_api_key_here
diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md b/examples/benchmarks/memanto_vs_mem0_persona_shift/README.md
@@ -0,0 +1,58 @@
+# Memanto vs Mem0 Benchmark Suite: Shifting Persona Test
+
+This directory contains a reproducible benchmarking suite that pits **Memanto** against **Mem0** (an established agentic memory framework). 
+
+It specifically evaluates **Scenario B: The Shifting Persona & Temporal Tracking Test**. As agents interact over multiple sessions, user preferences can change drastically. This benchmark tests how effectively both frameworks isolate current preferences without bloating the active context window or increasing latency.
+
+## Prerequisites
+
+1. Python 3.9+
+2. A free **Moorcheh API Key** (Get one at [moorcheh.ai](https://moorcheh.ai))
+3. A **Groq API Key** (Used for Mem0's internal LLM calls and the LLM-as-a-judge)
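+4. An **OpenAI API Key** (Optional. As shipped, the judge and Mem0's LLM both run on Groq and the scripts never read this key; it only matters if you reconfigure Mem0 or the judge to use OpenAI.)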
+
+## Setup Instructions
+
+1. Clone this repository and navigate to this folder:
+   ```bash
+   cd examples/benchmarks/memanto_vs_mem0_persona_shift
+   ```
+
+2. Create a virtual environment and install dependencies:
+   ```bash
+   python -m venv venv
+   source venv/bin/activate  # Or `venv\Scripts\activate` on Windows
+   pip install -r requirements.txt
+   ```
+
+3. Configure your API keys:
+   ```bash
+   cp .env.example .env
+   ```
+   Open the `.env` file and insert your `MOORCHEH_API_KEY`, `GROQ_API_KEY`, and optional `OPENAI_API_KEY`.
+
+## Running the Benchmark
+
+Simply run the benchmark script:
+
+```bash
+python benchmark.py
+```
+
+## Methodology
+
+### Dataset (`dataset.py`)
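+A simulated 4-session user interaction. In session 1 the user loves action and hates romance; in session 2 they watch a romantic comedy and concede it "wasn't terrible" while still preferring action; in session 3 they burn out on action entirely and ask for a slow romance or a deep drama. Session 4 is the recommendation request that is used as the retrieval query. The final expected state requires the memory system to synthesize this timeline properly, favoring the most recent preferences over the outdated ones.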
+
+### Adapters (`memory_layers.py`)
+Wrappers around Memanto and Mem0 to ensure they are fed identical data and evaluated fairly.
+
+### LLM Judge (`judge.py`)
+Uses `llama-3.3-70b-versatile` via Groq to grade the retrieved context (0-100) based on how well it isolated the user's current preferences from their outdated ones.
+
+### Output
+The script prints a clean table directly to the terminal using `rich`, displaying:
+- **Total Tokens Ingested**
+- **Tokens Retrieved**
+- **p95 Latency (s)** (95th percentile over every memory write plus the single retrieval call)
+- **Accuracy Score (0-100)**
diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py
@@ -0,0 +1,103 @@
+import os
+import uuid
+import numpy as np
+from rich.console import Console
+from rich.table import Table
+from dotenv import load_dotenv
+
+from dataset import SHIFTING_PERSONA_DATASET, EXPECTED_STATE
+from memory_layers import MemantoLayer, Mem0Layer
+from judge import LLMJudge
+
+# Load API keys from a local .env file (see .env.example) into the process
+# environment; LLMJudge and MemantoLayer later read them through os.getenv.
+load_dotenv()
+
+def run_evaluation(layer_name: str, layer, dataset: list, expected_state: str, judge: LLMJudge):
+    """Benchmark a single memory layer and return one row of results.
+
+    Every entry of ``dataset`` except the last is written through
+    ``layer.add_memory``; the last entry's ``content`` becomes the retrieval
+    query. The retrieved context is then graded by ``judge`` against
+    ``expected_state``.
+
+    Args:
+        layer_name: Label used in progress output and in the result row.
+        layer: Object exposing ``add_memory`` and ``retrieve_memory`` (and,
+            optionally, ``cleanup``).
+        dataset: Ordered messages, each a dict with a ``"content"`` key.
+        expected_state: Reference description handed to the judge.
+        judge: Evaluator whose ``evaluate`` returns a dict that may hold
+            ``"score"`` and ``"reasoning"``.
+
+    Returns:
+        A dict keyed by the display names used in the results table, plus
+        the judge's reasoning and an ASCII-only preview of the context.
+    """
+    # A random suffix gives every run its own user id.
+    user_id = f"test_user_{uuid.uuid4().hex[:8]}"
+    timings = []
+    ingested_tokens = 0
+
+    try:
+        # 1. Ingest: everything but the final message is stored as memory.
+        print(f"[{layer_name}] Starting ingestion...")
+        for turn in dataset[:-1]:
+            write_metrics = layer.add_memory(user_id=user_id, content=turn["content"])
+            timings.append(write_metrics["latency"])
+            ingested_tokens += write_metrics["tokens"]
+        print(f"[{layer_name}] Ingestion complete. Retrieving memory...")
+
+        # 2. Retrieve: the final message is the query.
+        final_query = dataset[-1]["content"]
+        retrieved_context, retrieve_metrics = layer.retrieve_memory(user_id=user_id, query=final_query)
+        timings.append(retrieve_metrics["latency"])
+
+        # 3. Judge the retrieved context against the expected state.
+        print(f"[{layer_name}] Judging retrieval accuracy...")
+        evaluation = judge.evaluate(expected_state=expected_state, retrieved_context=retrieved_context)
+
+        retrieved_tokens = retrieve_metrics["tokens"]
+        # Write and retrieval latencies share one list, so p95 covers both.
+        p95_latency = round(np.percentile(timings, 95), 3)
+
+        score = evaluation.get("score")
+        if score is None:
+            score = "N/A"
+        reasoning = evaluation.get("reasoning", "N/A")
+
+        # Preview: first 100 characters with non-ASCII dropped; an ellipsis
+        # marks a context that was longer than the preview window.
+        preview = retrieved_context[:100].encode('ascii', errors='ignore').decode('ascii')
+        if len(retrieved_context) > 100:
+            preview += "..."
+
+        return {
+            "Layer": layer_name,
+            "Total Tokens Ingested": ingested_tokens,
+            "Tokens Retrieved": retrieved_tokens,
+            "p95 Latency (s)": p95_latency,
+            "Accuracy Score": score,
+            "Judge Reasoning": reasoning,
+            "Context Snippet": preview,
+        }
+    finally:
+        # Runs on success and on failure; layers without a cleanup hook are skipped.
+        if hasattr(layer, 'cleanup'):
+            layer.cleanup(user_id)
+
+def main():
+    """Run both memory layers through the benchmark and print the comparison.
+
+    Prints a results table followed by the judge's reasoning and a context
+    snippet for each layer.
+    """
+    console = Console()
+    console.print("[bold cyan]Starting Memanto vs Mem0 Benchmark (Scenario: Shifting Persona)[/bold cyan]")
+
+    judge = LLMJudge()
+
+    # Both layers are constructed before either one is evaluated.
+    console.print("Initializing Memory Layers...")
+    contenders = [("Memanto", MemantoLayer()), ("Mem0", Mem0Layer())]
+
+    # Evaluate in listing order: Memanto first, then Mem0.
+    results = [
+        run_evaluation(label, layer, SHIFTING_PERSONA_DATASET, EXPECTED_STATE, judge)
+        for label, layer in contenders
+    ]
+
+    # Output table: one column per reported metric.
+    table = Table(title="Benchmark Results: Accuracy vs. Resource Footprint")
+    column_specs = (
+        ("Framework", {"justify": "left", "style": "cyan", "no_wrap": True}),
+        ("Total Tokens Ingested", {"justify": "right", "style": "magenta"}),
+        ("Tokens Retrieved", {"justify": "right", "style": "magenta"}),
+        ("p95 Latency (s)", {"justify": "right", "style": "green"}),
+        ("Accuracy Score", {"justify": "right", "style": "yellow"}),
+    )
+    for header, options in column_specs:
+        table.add_column(header, **options)
+
+    for row in results:
+        score = row['Accuracy Score']
+        # "N/A" marks a run the judge could not score; it gets no "/100" suffix.
+        if score != "N/A":
+            score_cell = f"{score}/100"
+        else:
+            score_cell = "N/A"
+        table.add_row(
+            row["Layer"],
+            str(row["Total Tokens Ingested"]),
+            str(row["Tokens Retrieved"]),
+            str(row["p95 Latency (s)"]),
+            score_cell,
+        )
+
+    console.print("\n")
+    console.print(table)
+
+    console.print("\n[bold]Judge Reasoning Notes:[/bold]")
+    for row in results:
+        # Non-ASCII characters are dropped from the reasoning before printing.
+        ascii_reasoning = row['Judge Reasoning'].encode('ascii', errors='ignore').decode('ascii')
+        console.print(f"- [bold cyan]{row['Layer']}[/bold cyan]: {ascii_reasoning}")
+        console.print(f"  [dim]Snippet: {row['Context Snippet']}[/dim]")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/dataset.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/dataset.py
@@ -0,0 +1,26 @@
+# dataset.py
+
+# Scripted conversation for the shifting-persona scenario. benchmark.py stores
+# every entry except the last as a memory and uses the last entry's "content"
+# as the retrieval query. Only "content" is read there; "session" and "role"
+# are descriptive metadata.
+SHIFTING_PERSONA_DATASET = [
+    # Session 1: baseline taste -- loves action, rejects romance and slow drama.
+    {
+        "session": 1,
+        "role": "user",
+        "content": "I am a huge fan of fast-paced action movies. I can't stand romance or slow dramas. They bore me to tears. Please remember this."
+    },
+    # Session 2: partial softening -- a rom-com is tolerable, action still preferred.
+    {
+        "session": 2,
+        "role": "user",
+        "content": "I actually went on a date last night and we watched a romantic comedy. Surprisingly, it wasn't terrible. I still prefer action, but maybe a rom-com once in a while is okay if it has good pacing."
+    },
+    # Session 3: reversal -- burnt out on action, now wants slow romance or drama.
+    {
+        "session": 3,
+        "role": "user",
+        "content": "Work has been really stressful lately. Honestly, I'm completely burnt out on loud action movies. I just want something relaxing right now. A slow, heartfelt romance or a deep drama sounds perfect."
+    },
+    # Session 4: the query; it is never ingested as a memory.
+    {
+        "session": 4,
+        "role": "user",
+        "content": "Can you recommend a movie for me to watch tonight based on my current preferences?"
+    }
+]
+
+# Reference answer that benchmark.py passes to the judge as the expected
+# current state; it reflects session 3, the most recent preference statement.
+EXPECTED_STATE = "The user is currently burnt out on action movies due to stress and strongly prefers a relaxing, slow, heartfelt romance or deep drama for tonight."
diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py
@@ -0,0 +1,50 @@
+import os
+import json
+from groq import Groq
+
+class LLMJudge:
+    """LLM-as-a-judge that grades retrieved memory context through Groq.
+
+    The Groq API key is read from the ``GROQ_API_KEY`` environment variable
+    when the judge is constructed.
+    """
+
+    def __init__(self):
+        self.client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+        self.model = "llama-3.3-70b-versatile"  # Using Groq's fast Llama 3 model
+
+    @staticmethod
+    def _normalize_verdict(raw) -> dict:
+        """Coerce the judge's parsed JSON into a dict with a safe score and reasoning.
+
+        The model is only *asked* for ``{"score": int, "reasoning": str}``, so
+        the payload is validated before callers format or compare it:
+
+        * a payload that is not a JSON object yields ``score=None``;
+        * ``score`` is accepted as an int, float or numeric string, rounded
+          to an int, and replaced by ``None`` when it is a boolean,
+          non-numeric or outside 0-100;
+        * ``reasoning`` is always a string (``"N/A"`` when missing or null).
+
+        Any extra keys the model returned are preserved.
+        """
+        if not isinstance(raw, dict):
+            return {"score": None, "reasoning": f"Judge returned non-object JSON: {raw!r}"}
+
+        score = raw.get("score")
+        if isinstance(score, bool):
+            # bool is a subclass of int; True/False is not a grade.
+            score = None
+        else:
+            try:
+                numeric = float(score)
+            except (TypeError, ValueError, OverflowError):
+                score = None
+            else:
+                # The range test is also False for NaN and +/-inf.
+                score = round(numeric) if 0 <= numeric <= 100 else None
+
+        reasoning = raw.get("reasoning")
+        reasoning = "N/A" if reasoning is None else str(reasoning)
+
+        return {**raw, "score": score, "reasoning": reasoning}
+
+    def evaluate(self, expected_state: str, retrieved_context: str) -> dict:
+        """
+        Evaluates the retrieved_context against the expected_state.
+
+        Returns a dict with 'score' (an int in 0-100, or None when the judge
+        failed or produced an unusable score) and 'reasoning' (always a str).
+        Never raises: any failure while calling the model or parsing its
+        reply is reported through 'reasoning' with 'score' set to None.
+        """
+        prompt = f"""
+        You are an expert AI evaluator judging the accuracy of a memory retrieval system.
+
+        The user has a dynamically shifting persona and preferences over time.
+
+        EXPECTED CURRENT STATE:
+        {expected_state}
+
+        RETRIEVED CONTEXT FROM MEMORY SYSTEM:
+        {retrieved_context}
+
+        Your task is to grade how accurately the RETRIEVED CONTEXT captures the EXPECTED CURRENT STATE.
+        A perfect score means the retrieved context clearly highlights the current preferences and downplays or correctly contextualizes outdated preferences.
+        A low score means the retrieved context is bloated with contradictory outdated information or misses the current state entirely.
+
+        Output your evaluation in strict JSON format:
+        {{
+            "score": <int between 0 and 100>,
+            "reasoning": "<brief explanation of the score>"
+        }}
+        """
+
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {"role": "system", "content": "You are a JSON-outputting evaluator. Output only valid JSON without markdown wrapping."},
+                    {"role": "user", "content": prompt}
+                ],
+                response_format={ "type": "json_object" },
+                temperature=0.0
+            )
+            parsed = json.loads(response.choices[0].message.content)
+            return self._normalize_verdict(parsed)
+        except Exception as e:
+            # Deliberate catch-all: one failed judgement must not abort the
+            # whole benchmark run, so the failure is surfaced as an "N/A" row.
+            return {"score": None, "reasoning": f"Judge runtime failure: {str(e)}"}
diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py b/examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py
@@ -0,0 +1,125 @@
+import logging
+import os
+import time
+import uuid
+from abc import ABC, abstractmethod
+
+class BaseMemoryLayer(ABC):
+    """Interface every benchmarked memory framework adapter implements.
+
+    Subclasses must provide ``add_memory`` and ``retrieve_memory``.
+    ``cleanup`` is an optional hook with a no-op default, so callers can
+    invoke it on any layer without first checking that it exists.
+    """
+
+    @abstractmethod
+    def add_memory(self, user_id: str, content: str) -> dict:
+        """Add a memory and return metrics {"latency": float, "tokens": int}"""
+
+    @abstractmethod
+    def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]:
+        """Retrieve memory context and return (context, metrics)"""
+
+    def cleanup(self, user_id: str) -> None:
+        """Release whatever was stored for ``user_id``; does nothing by default."""
+        return None
+
+class MemantoLayer(BaseMemoryLayer):
+    """Memory layer backed by the Moorcheh SDK, using one namespace per user id.
+
+    Memories are uploaded as text documents into a namespace named after the
+    user id; retrieval asks the service to generate an answer to the query
+    from that namespace.
+    """
+
+    def __init__(self):
+        """Build the Moorcheh client, purge stale test namespaces, load the tokenizer.
+
+        Raises:
+            ValueError: If ``MOORCHEH_API_KEY`` is not set.
+        """
+        from moorcheh_sdk import MoorchehClient
+        api_key = os.getenv("MOORCHEH_API_KEY")
+        if not api_key:
+            raise ValueError("MOORCHEH_API_KEY environment variable is not set.")
+        self.client = MoorchehClient(api_key=api_key)
+        # Namespaces this instance has already created, so each is created once.
+        self.created_namespaces = set()
+
+        # Clean up any leftover test namespaces on startup
+        self._purge_stale_namespaces()
+
+        # Simple token estimation for benchmark purposes if SDK doesn't provide it
+        import tiktoken
+        self.encoder = tiktoken.get_encoding("cl100k_base")
+
+    def _purge_stale_namespaces(self) -> None:
+        """Best-effort removal of ``test_user_*`` namespaces left by earlier runs.
+
+        Failures are logged instead of raised, and one namespace that cannot
+        be deleted does not stop the remaining ones from being tried.
+
+        NOTE(review): this deletes *every* namespace whose name starts with
+        ``test_user_``, which would include those of another benchmark run
+        using the same API key at the same time.
+        """
+        log = logging.getLogger(__name__)
+        try:
+            ns_list = self.client.namespaces.list().get('namespaces', []) or []
+            for ns in ns_list:
+                try:
+                    name = ns.get('namespace_name', '')
+                    if name.startswith('test_user_'):
+                        self.client.namespaces.delete(namespace_name=name)
+                except Exception as exc:
+                    log.warning("Could not delete stale namespace %r: %s", ns, exc)
+        except Exception as exc:
+            log.warning("Could not list namespaces for startup cleanup: %s", exc)
+
+    def _count_tokens(self, text: str) -> int:
+        """Return the number of ``cl100k_base`` tokens in ``text``."""
+        return len(self.encoder.encode(text))
+
+    def add_memory(self, user_id: str, content: str) -> dict:
+        """Upload ``content`` as one document into the user's namespace.
+
+        Returns:
+            ``{"latency": seconds spent in the upload call, "tokens": token
+            count of content}``.
+        """
+        # Ensure the namespace is created before uploading. This one-time
+        # provisioning is kept outside the timed region so the first write is
+        # measured on the same footing as every later one.
+        if user_id not in self.created_namespaces:
+            self.client.namespaces.create(namespace_name=user_id, type='text')
+            self.created_namespaces.add(user_id)
+
+        documents = [{"id": str(uuid.uuid4()), "text": content}]
+
+        # perf_counter is monotonic and high-resolution; time.time() can jump
+        # when the system clock is adjusted.
+        start_time = time.perf_counter()
+        self.client.documents.upload(namespace_name=user_id, documents=documents)
+        latency = time.perf_counter() - start_time
+
+        return {"latency": latency, "tokens": self._count_tokens(content)}
+
+    def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]:
+        """Ask the service to answer ``query`` from the user's namespace.
+
+        Returns:
+            ``(answer_text, {"latency": seconds, "tokens": token count of
+            the answer})``.
+
+        Raises:
+            RuntimeError: If the response carries no string ``answer``.
+        """
+        start_time = time.perf_counter()
+        res = self.client.answer.generate(query=query, namespace=user_id)
+        latency = time.perf_counter() - start_time
+
+        # The response is read either as a dict or as an attribute-style object.
+        if isinstance(res, dict):
+            context = res.get("answer")
+        else:
+            context = getattr(res, "answer", None)
+        if not isinstance(context, str):
+            raise RuntimeError(f"Unexpected moorcheh answer.generate response type: {type(res).__name__}")
+
+        return context, {"latency": latency, "tokens": self._count_tokens(context)}
+
+    def cleanup(self, user_id: str) -> None:
+        """Best-effort deletion of the user's namespace; failures are logged, not raised."""
+        try:
+            self.client.namespaces.delete(namespace_name=user_id)
+        except Exception as exc:
+            logging.getLogger(__name__).warning(
+                "Could not delete namespace %r: %s", user_id, exc
+            )
+        else:
+            # Forget the namespace only once it is really gone, so that a
+            # later add_memory for the same user id re-creates it.
+            self.created_namespaces.discard(user_id)
+
+class Mem0Layer(BaseMemoryLayer):
+    """Memory layer backed by Mem0's ``Memory`` class.
+
+    Configured with a Groq-hosted LLM, a Hugging Face sentence-transformers
+    embedder and a Qdrant vector store.
+    """
+
+    def __init__(self):
+        """Build the Mem0 ``Memory`` client from the benchmark config and load the tokenizer."""
+        from mem0 import Memory
+        config = {
+            "llm": {
+                "provider": "groq",
+                "config": {
+                    "model": "llama-3.3-70b-versatile",
+                    "temperature": 0.0,
+                    "max_tokens": 1500,
+                }
+            },
+            "embedder": {
+                "provider": "huggingface",
+                "config": {
+                    "model": "sentence-transformers/all-MiniLM-L6-v2"
+                }
+            },
+            "vector_store": {
+                "provider": "qdrant",
+                "config": {
+                    "collection_name": "mem0_hf",
+                    # Must match the embedder's output size (presumably 384
+                    # for all-MiniLM-L6-v2 -- confirm if the model changes).
+                    "embedding_model_dims": 384
+                }
+            }
+        }
+        self.client = Memory.from_config(config)
+
+        import tiktoken
+        self.encoder = tiktoken.get_encoding("cl100k_base")
+
+    def _count_tokens(self, text: str) -> int:
+        """Return the number of ``cl100k_base`` tokens in ``text``."""
+        return len(self.encoder.encode(text))
+
+    def add_memory(self, user_id: str, content: str) -> dict:
+        """Store ``content`` for ``user_id`` through Mem0.
+
+        Returns:
+            ``{"latency": seconds spent in the add call, "tokens": token
+            count of content}``.
+        """
+        # perf_counter is monotonic and high-resolution; time.time() can jump
+        # when the system clock is adjusted.
+        start_time = time.perf_counter()
+        self.client.add(content, user_id=user_id)
+        latency = time.perf_counter() - start_time
+        return {"latency": latency, "tokens": self._count_tokens(content)}
+
+    def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]:
+        """Search the user's memories and join the hits into one context string.
+
+        Returns:
+            ``(context, {"latency": seconds spent in the search call,
+            "tokens": token count of context})``; ``context`` is empty when
+            nothing matched.
+        """
+        start_time = time.perf_counter()
+        results = self.client.search(query, filters={'user_id': user_id})
+        latency = time.perf_counter() - start_time
+
+        # Format Mem0 results into a single context string. The reply is
+        # handled both as {"results": [...]} and as a bare list.
+        memories = results.get("results", []) if isinstance(results, dict) else results
+        # A hit whose "memory" field is missing or None contributes an empty
+        # line instead of crashing str.join.
+        context = "\n".join((hit.get("memory") or "") for hit in (memories or []))
+
+        return context, {"latency": latency, "tokens": self._count_tokens(context)}
+
+    def cleanup(self, user_id: str) -> None:
+        """Best-effort removal of everything stored for ``user_id``.
+
+        The benchmark runner calls ``cleanup`` on any layer that defines it;
+        without this method the memories written during a run are never
+        removed by this adapter. Failures are logged, not raised.
+        """
+        try:
+            self.client.delete_all(user_id=user_id)
+        except Exception as exc:
+            logging.getLogger(__name__).warning(
+                "Could not delete Mem0 memories for %r: %s", user_id, exc
+            )
diff --git a/examples/benchmarks/memanto_vs_mem0_persona_shift/requirements.txt b/examples/benchmarks/memanto_vs_mem0_persona_shift/requirements.txt
@@ -0,0 +1,8 @@
+memanto
+mem0ai
+openai
+python-dotenv
+tiktoken
+rich
+groq
+sentence-transformers