Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
MOORCHEH_API_KEY=your_moorcheh_api_key_here
OPENAI_API_KEY=your_openai_api_key_here
Comment thread
coderabbitai[bot] marked this conversation as resolved.
GROQ_API_KEY=your_groq_api_key_here
58 changes: 58 additions & 0 deletions examples/benchmarks/memanto_vs_mem0_persona_shift/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Memanto vs Mem0 Benchmark Suite: Shifting Persona Test

This directory contains a reproducible benchmarking suite that pits **Memanto** against **Mem0** (an established agentic memory framework).

It specifically evaluates **Scenario B: The Shifting Persona & Temporal Tracking Test**. As agents interact over multiple sessions, user preferences can change drastically. This benchmark tests how effectively both frameworks isolate current preferences without bloating the active context window or increasing latency.

## Prerequisites

1. Python 3.9+
2. A free **Moorcheh API Key** (Get one at [moorcheh.ai](https://moorcheh.ai))
3. A **Groq API Key** (Used for Mem0's internal LLM calls and the LLM-as-a-judge)
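4. An **OpenAI API Key** (Optional. The default configuration runs both Mem0's internal LLM calls and the judge on Groq, so this key is only needed if you reconfigure them to use OpenAI.)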

## Setup Instructions

1. Clone this repository and navigate to this folder:
```bash
cd examples/benchmarks/memanto_vs_mem0_persona_shift
```

2. Create a virtual environment and install dependencies:
```bash
python -m venv venv
source venv/bin/activate # Or `venv\Scripts\activate` on Windows
pip install -r requirements.txt
```

3. Configure your API keys:
```bash
cp .env.example .env
```
Open the `.env` file and insert your `MOORCHEH_API_KEY`, `GROQ_API_KEY`, and optional `OPENAI_API_KEY`.

## Running the Benchmark

Simply run the benchmark script:

```bash
python benchmark.py
```

## Methodology

### Dataset (`dataset.py`)
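A simulated 4-session user interaction. In session 1 the user loves action and can't stand romance or slow dramas; in session 2 they watch a romantic comedy and find it tolerable while still preferring action; in session 3 they are burnt out on action and want a slow, heartfelt romance or a deep drama. Session 4 is the recommendation request, which is used as the retrieval query rather than stored. The final expected state requires the memory system to synthesize this timeline properly.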

### Adapters (`memory_layers.py`)
Wrappers around Memanto and Mem0 to ensure they are fed identical data and evaluated fairly.

### LLM Judge (`judge.py`)
Uses `llama-3.3-70b-versatile` via Groq to grade the retrieved context (0-100) based on how well it isolated the user's current preferences from their outdated ones.

### Output
The script prints a clean table directly to the terminal using `rich`, displaying:
- **Total Tokens Ingested**
- **Tokens Retrieved**
- **p95 Latency (s)** (95th percentile over all ingestion calls plus the retrieval call)
- **Accuracy Score (0-100)**
103 changes: 103 additions & 0 deletions examples/benchmarks/memanto_vs_mem0_persona_shift/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import os
import uuid
import numpy as np
from rich.console import Console
from rich.table import Table
from dotenv import load_dotenv

from dataset import SHIFTING_PERSONA_DATASET, EXPECTED_STATE
from memory_layers import MemantoLayer, Mem0Layer
from judge import LLMJudge

# Load environment variables
load_dotenv()

def run_evaluation(layer_name: str, layer, dataset: list, expected_state: str, judge: LLMJudge):
    """Run one memory layer through the ingest -> retrieve -> judge pipeline.

    Every dataset entry except the last is stored through
    ``layer.add_memory``; the content of the last entry is used as the
    retrieval query. The retrieved context is then graded by ``judge``
    against ``expected_state``.

    Args:
        layer_name: Label used in progress output and in the result row.
        layer: Object exposing ``add_memory`` and ``retrieve_memory``. If it
            also has a ``cleanup`` attribute, ``cleanup(user_id)`` is called
            when the run finishes, whether it succeeded or raised.
        dataset: Sequence of message dicts, each with a ``"content"`` key.
        expected_state: Reference description handed to the judge.
        judge: Evaluator whose ``evaluate`` returns a dict read via
            ``.get("score")`` and ``.get("reasoning")``.

    Returns:
        A dict with the layer label, token counts, the 95th-percentile
        latency over all ingestion and retrieval calls, the judge's score
        and reasoning, and an ASCII-only snippet of the retrieved context.
    """
    # Fresh id per run so separate evaluations never share stored memories.
    user_id = f"test_user_{uuid.uuid4().hex[:8]}"

    call_latencies = []
    ingested_tokens = 0

    try:
        print(f"[{layer_name}] Starting ingestion...")
        # Step 1: store every message except the trailing query.
        for message in dataset[:-1]:
            ingest_metrics = layer.add_memory(user_id=user_id, content=message["content"])
            call_latencies.append(ingest_metrics["latency"])
            ingested_tokens += ingest_metrics["tokens"]

        print(f"[{layer_name}] Ingestion complete. Retrieving memory...")

        # Step 2: the last dataset entry is the question asked of the memory.
        query = dataset[-1]["content"]
        retrieved_context, retrieve_metrics = layer.retrieve_memory(user_id=user_id, query=query)
        call_latencies.append(retrieve_metrics["latency"])

        # Step 3: grade what came back.
        print(f"[{layer_name}] Judging retrieval accuracy...")
        evaluation = judge.evaluate(expected_state=expected_state, retrieved_context=retrieved_context)

        retrieved_tokens = retrieve_metrics["tokens"]
        p95_latency = round(np.percentile(call_latencies, 95), 3)

        score = evaluation.get("score")
        if score is None:
            score = "N/A"
        reasoning = evaluation.get("reasoning", "N/A")

        # Snippet: at most 100 characters, non-ASCII characters dropped,
        # with an ellipsis appended only when the context was cut.
        was_truncated = len(retrieved_context) > 100
        head = retrieved_context[:100] if was_truncated else retrieved_context
        snippet = head.encode('ascii', errors='ignore').decode('ascii')
        if was_truncated:
            snippet = snippet + "..."

        return {
            "Layer": layer_name,
            "Total Tokens Ingested": ingested_tokens,
            "Tokens Retrieved": retrieved_tokens,
            "p95 Latency (s)": p95_latency,
            "Accuracy Score": score,
            "Judge Reasoning": reasoning,
            "Context Snippet": snippet,
        }
    finally:
        if hasattr(layer, 'cleanup'):
            layer.cleanup(user_id)

def main():
    """Run the shifting-persona benchmark for both layers and print a report.

    Builds the judge and both memory layers, evaluates Memanto and then Mem0
    on the same dataset, renders the numeric metrics as a Rich table, and
    finally prints the judge's reasoning and a context snippet per layer.
    """
    # Local import keeps this function self-contained; ``rich`` is already a
    # dependency of this script.
    from rich.markup import escape

    console = Console()
    console.print("[bold cyan]Starting Memanto vs Mem0 Benchmark (Scenario: Shifting Persona)[/bold cyan]")

    judge = LLMJudge()

    # Initialize layers
    console.print("Initializing Memory Layers...")
    memanto = MemantoLayer()
    mem0 = Mem0Layer()

    # Evaluate both layers on identical inputs, Memanto first.
    results = [
        run_evaluation(name, layer, SHIFTING_PERSONA_DATASET, EXPECTED_STATE, judge)
        for name, layer in (("Memanto", memanto), ("Mem0", mem0))
    ]

    # Output Table
    table = Table(title="Benchmark Results: Accuracy vs. Resource Footprint")

    table.add_column("Framework", justify="left", style="cyan", no_wrap=True)
    table.add_column("Total Tokens Ingested", justify="right", style="magenta")
    table.add_column("Tokens Retrieved", justify="right", style="magenta")
    table.add_column("p95 Latency (s)", justify="right", style="green")
    table.add_column("Accuracy Score", justify="right", style="yellow")

    for r in results:
        score_str = f"{r['Accuracy Score']}/100" if r['Accuracy Score'] != "N/A" else "N/A"
        table.add_row(
            r["Layer"],
            str(r["Total Tokens Ingested"]),
            str(r["Tokens Retrieved"]),
            str(r["p95 Latency (s)"]),
            score_str,
        )

    console.print("\n")
    console.print(table)

    console.print("\n[bold]Judge Reasoning Notes:[/bold]")
    for r in results:
        # The reasoning comes from an LLM and is not guaranteed to be a
        # string, so coerce it before the ASCII clean-up.
        reasoning = r['Judge Reasoning']
        reasoning_text = "N/A" if reasoning is None else str(reasoning)
        reasoning_clean = reasoning_text.encode('ascii', errors='ignore').decode('ascii')
        # Reasoning and snippet are free-form text; escape them so square
        # brackets are printed literally instead of being parsed as Rich
        # markup tags (which can drop text or raise MarkupError).
        console.print(f"- [bold cyan]{r['Layer']}[/bold cyan]: {escape(reasoning_clean)}")
        console.print(f" [dim]Snippet: {escape(str(r['Context Snippet']))}[/dim]")


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
26 changes: 26 additions & 0 deletions examples/benchmarks/memanto_vs_mem0_persona_shift/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# dataset.py

# Ordered user messages, one per session. Each entry carries a session
# number, the speaker role, and the message text.
SHIFTING_PERSONA_DATASET = [
    # Session 1: strong preference for action; dislikes romance and slow dramas.
    {
        "session": 1,
        "role": "user",
        "content": "I am a huge fan of fast-paced action movies. I can't stand romance or slow dramas. They bore me to tears. Please remember this."
    },
    # Session 2: softened stance - a rom-com is acceptable, action still preferred.
    {
        "session": 2,
        "role": "user",
        "content": "I actually went on a date last night and we watched a romantic comedy. Surprisingly, it wasn't terrible. I still prefer action, but maybe a rom-com once in a while is okay if it has good pacing."
    },
    # Session 3: reversal - burnt out on action, wants a slow romance or drama.
    {
        "session": 3,
        "role": "user",
        "content": "Work has been really stressful lately. Honestly, I'm completely burnt out on loud action movies. I just want something relaxing right now. A slow, heartfelt romance or a deep drama sounds perfect."
    },
    # Session 4: the open-ended recommendation request.
    {
        "session": 4,
        "role": "user",
        "content": "Can you recommend a movie for me to watch tonight based on my current preferences?"
    }
]

# Reference description of the user's preferences after session 3.
EXPECTED_STATE = "The user is currently burnt out on action movies due to stress and strongly prefers a relaxing, slow, heartfelt romance or deep drama for tonight."
50 changes: 50 additions & 0 deletions examples/benchmarks/memanto_vs_mem0_persona_shift/judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import json
from groq import Groq

class LLMJudge:
    """LLM-as-a-judge that grades retrieved memory context through Groq."""

    def __init__(self):
        """Create the Groq client from the ``GROQ_API_KEY`` environment variable."""
        self.client = Groq(api_key=os.getenv("GROQ_API_KEY"))
        self.model = "llama-3.3-70b-versatile"  # Llama 3.3 70B served by Groq

    @staticmethod
    def _normalize_result(result) -> dict:
        """Coerce the judge's parsed JSON into the documented result shape.

        The model is asked for ``{"score": int, "reasoning": str}`` but its
        output is not guaranteed to comply, so the payload is validated
        before callers read it.

        Args:
            result: Whatever ``json.loads`` produced from the model output.

        Returns:
            A dict whose ``"score"`` is an int clamped to 0-100, or ``None``
            when no usable number was supplied, and whose ``"reasoning"`` is
            always a string. Any extra keys of a dict payload are kept.
        """
        if not isinstance(result, dict):
            return {
                "score": None,
                "reasoning": f"Judge returned non-object JSON: {type(result).__name__}",
            }

        raw_score = result.get("score")
        score = None
        # bool is a subclass of int; True/False is never a meaningful grade.
        if not isinstance(raw_score, bool) and isinstance(raw_score, (int, float, str)):
            try:
                score = int(round(float(raw_score)))
            except (ValueError, OverflowError):
                # Non-numeric text, NaN or infinity.
                score = None
            else:
                score = max(0, min(100, score))

        reasoning = result.get("reasoning")
        if reasoning is None:
            reasoning = "N/A"
        elif not isinstance(reasoning, str):
            reasoning = str(reasoning)

        return {**result, "score": score, "reasoning": reasoning}

    def evaluate(self, expected_state: str, retrieved_context: str) -> dict:
        """
        Evaluates the retrieved_context against the expected_state.
        Returns a dict with 'score' (0-100, or None on failure) and 'reasoning'.

        Any failure while calling the model or parsing its output is reported
        through the returned dict instead of being raised.
        """
        # Prompt lines are kept flush-left so no indentation leaks into the
        # text sent to the model.
        prompt = f"""
You are an expert AI evaluator judging the accuracy of a memory retrieval system.

The user has a dynamically shifting persona and preferences over time.

EXPECTED CURRENT STATE:
{expected_state}

RETRIEVED CONTEXT FROM MEMORY SYSTEM:
{retrieved_context}

Your task is to grade how accurately the RETRIEVED CONTEXT captures the EXPECTED CURRENT STATE.
A perfect score means the retrieved context clearly highlights the current preferences and downplays or correctly contextualizes outdated preferences.
A low score means the retrieved context is bloated with contradictory outdated information or misses the current state entirely.

Output your evaluation in strict JSON format:
{{
"score": <int between 0 and 100>,
"reasoning": "<brief explanation of the score>"
}}
"""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a JSON-outputting evaluator. Output only valid JSON without markdown wrapping."},
                    {"role": "user", "content": prompt}
                ],
                response_format={ "type": "json_object" },
                temperature=0.0
            )
            parsed = json.loads(response.choices[0].message.content)
            return self._normalize_result(parsed)
        except Exception as e:
            # Boundary handler: a judge failure must not abort the benchmark.
            return {"score": None, "reasoning": f"Judge runtime failure: {str(e)}"}
125 changes: 125 additions & 0 deletions examples/benchmarks/memanto_vs_mem0_persona_shift/memory_layers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import os
import time
from abc import ABC, abstractmethod

class BaseMemoryLayer(ABC):
    """Abstract interface implemented by each benchmarked memory backend."""

    @abstractmethod
    def add_memory(self, user_id: str, content: str) -> dict:
        """Store ``content`` for ``user_id``.

        Returns a metrics dict of the form
        ``{"latency": float, "tokens": int}``.
        """

    @abstractmethod
    def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]:
        """Fetch the memory context relevant to ``query`` for ``user_id``.

        Returns a ``(context, metrics)`` pair.
        """

class MemantoLayer(BaseMemoryLayer):
    """Memory layer that stores each user's messages in a Moorcheh namespace."""

    def __init__(self):
        """Build the Moorcheh client and remove leftover benchmark namespaces.

        Raises:
            ValueError: If ``MOORCHEH_API_KEY`` is not set.
        """
        from moorcheh_sdk import MoorchehClient
        import tiktoken

        api_key = os.getenv("MOORCHEH_API_KEY")
        if not api_key:
            raise ValueError("MOORCHEH_API_KEY environment variable is not set.")
        self.client = MoorchehClient(api_key=api_key)
        # Namespaces this instance has created and not yet deleted.
        self.created_namespaces = set()

        # Simple token estimation for benchmark purposes if SDK doesn't provide it
        self.encoder = tiktoken.get_encoding("cl100k_base")

        # Clean up any leftover test namespaces on startup. This is
        # best-effort: a failure is reported but must not stop the benchmark.
        try:
            ns_list = self.client.namespaces.list().get('namespaces', [])
            for ns in ns_list:
                name = ns.get('namespace_name', '')
                if name.startswith('test_user_'):
                    self.client.namespaces.delete(namespace_name=name)
        except Exception as exc:
            self._warn("Startup cleanup of leftover test namespaces failed", exc)

    @staticmethod
    def _warn(message: str, error: Exception) -> None:
        """Log a non-fatal failure instead of silently discarding it."""
        import logging

        logging.getLogger(__name__).warning("%s: %s", message, error)

    def _count_tokens(self, text: str) -> int:
        """Return the number of tokens ``self.encoder`` produces for ``text``."""
        return len(self.encoder.encode(text))

    def add_memory(self, user_id: str, content: str) -> dict:
        """Upload ``content`` as one document in the namespace named ``user_id``.

        The namespace is created on the first upload for a given ``user_id``,
        so that first call's latency includes the namespace creation.

        Returns:
            ``{"latency": seconds, "tokens": token count of content}``.
        """
        import uuid

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Ensure the namespace is created before uploading
        if user_id not in self.created_namespaces:
            self.client.namespaces.create(namespace_name=user_id, type='text')
            self.created_namespaces.add(user_id)

        self.client.documents.upload(
            namespace_name=user_id,
            documents=[{"id": str(uuid.uuid4()), "text": content}],
        )

        latency = time.perf_counter() - start_time
        return {"latency": latency, "tokens": self._count_tokens(content)}

    def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]:
        """Ask Moorcheh to answer ``query`` from the ``user_id`` namespace.

        Returns:
            ``(answer text, {"latency": seconds, "tokens": token count})``.

        Raises:
            RuntimeError: If the response carries no string ``answer``.
        """
        start_time = time.perf_counter()
        res = self.client.answer.generate(query=query, namespace=user_id)
        latency = time.perf_counter() - start_time

        # The response is read either as a mapping or as an object attribute.
        if isinstance(res, dict):
            context = res.get("answer")
        else:
            context = getattr(res, "answer", None)
        if not isinstance(context, str):
            raise RuntimeError(f"Unexpected moorcheh answer.generate response type: {type(res).__name__}")

        return context, {"latency": latency, "tokens": self._count_tokens(context)}

    def cleanup(self, user_id: str):
        """Delete the namespace named ``user_id`` (best-effort, never raises)."""
        try:
            self.client.namespaces.delete(namespace_name=user_id)
        except Exception as exc:
            self._warn(f"Could not delete namespace {user_id!r}", exc)
        else:
            # Forget the namespace so that reusing this id later recreates it
            # instead of uploading into a namespace that no longer exists.
            self.created_namespaces.discard(user_id)

class Mem0Layer(BaseMemoryLayer):
    """Memory layer backed by Mem0 (Groq LLM, HuggingFace embedder, Qdrant store)."""

    def __init__(self):
        """Build the Mem0 ``Memory`` client and the token counter."""
        from mem0 import Memory
        import tiktoken

        config = {
            "llm": {
                "provider": "groq",
                "config": {
                    "model": "llama-3.3-70b-versatile",
                    "temperature": 0.0,
                    "max_tokens": 1500,
                }
            },
            "embedder": {
                "provider": "huggingface",
                "config": {
                    "model": "sentence-transformers/all-MiniLM-L6-v2"
                }
            },
            "vector_store": {
                "provider": "qdrant",
                "config": {
                    "collection_name": "mem0_hf",
                    # Configured to match the embedder model named above.
                    "embedding_model_dims": 384
                }
            }
        }
        self.client = Memory.from_config(config)

        self.encoder = tiktoken.get_encoding("cl100k_base")

    def _count_tokens(self, text: str) -> int:
        """Return the number of tokens ``self.encoder`` produces for ``text``."""
        return len(self.encoder.encode(text))

    def add_memory(self, user_id: str, content: str) -> dict:
        """Store ``content`` for ``user_id`` through Mem0.

        Returns:
            ``{"latency": seconds, "tokens": token count of content}``.
        """
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        self.client.add(content, user_id=user_id)
        latency = time.perf_counter() - start_time
        return {"latency": latency, "tokens": self._count_tokens(content)}

    def retrieve_memory(self, user_id: str, query: str) -> tuple[str, dict]:
        """Search Mem0 for ``query`` and join the hits into one context string.

        Returns:
            ``(context, {"latency": seconds, "tokens": token count})`` where
            ``context`` has one memory per line, or is empty without hits.
        """
        start_time = time.perf_counter()
        results = self.client.search(query, filters={'user_id': user_id})
        latency = time.perf_counter() - start_time

        # Format Mem0 results into a single context string. The search
        # result is accepted either as {"results": [...]} or as a bare list.
        memories = results.get("results", []) if isinstance(results, dict) else results
        # ``or ""`` guards against a hit whose "memory" value is None, which
        # would otherwise make str.join raise TypeError.
        context = "\n".join((hit.get("memory") or "") for hit in memories) if memories else ""

        return context, {"latency": latency, "tokens": self._count_tokens(context)}

    def cleanup(self, user_id: str):
        """Delete every memory stored for ``user_id`` (best-effort, never raises)."""
        try:
            self.client.delete_all(user_id=user_id)
        except Exception as exc:
            import logging

            logging.getLogger(__name__).warning(
                "Could not delete Mem0 memories for %r: %s", user_id, exc
            )
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
memanto
mem0ai
openai
python-dotenv
tiktoken
rich
groq
sentence-transformers