MemTensor · Nyakult · Jul 25, 2025 · Jul 25, 2025 · Jul 28, 2025 · Jul 28, 2025
diff --git a/evaluation/.env-example b/evaluation/.env-example
@@ -1,3 +1,4 @@
+# memory process model
 MODEL="gpt-4o-mini"
 OPENAI_API_KEY="sk-***REDACTED***"
 OPENAI_BASE_URL="http://***.***.***.***:3000/v1"
@@ -6,6 +7,13 @@ MEM0_API_KEY="m0-***REDACTED***"
 
 ZEP_API_KEY="z_***REDACTED***"
 
+# response model
 CHAT_MODEL="gpt-4o-mini"
 CHAT_MODEL_BASE_URL="http://***.***.***.***:3000/v1"
 CHAT_MODEL_API_KEY="sk-***REDACTED***"
+
+MEMOS_KEY="Token mpg-xxxxx"
+MEMOS_URL="https://apigw-pre.memtensor.cn/api/openmem/v1"
+
+MEMOBASE_API_KEY="xxxxx"
+MEMOBASE_PROJECT_URL="http://xxx.xxx.xxx.xxx:8019"
diff --git a/evaluation/README.md b/evaluation/README.md
@@ -34,3 +34,17 @@ This repository provides tools and scripts for evaluating the LoCoMo dataset usi
 ```
 
 ✍️ For evaluating OpenAI's native memory feature with the LoCoMo dataset, please refer to the detailed guide: [OpenAI Memory on LoCoMo - Evaluation Guide](./scripts/locomo/openai_memory_locomo_eval_guide.md).
+
+### LongMemEval Evaluation
+First, download the `longmemeval_s` dataset from https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned
+and save it as `data/longmemeval/longmemeval_s.json`.
+
+```bash
+# Edit the configuration in ./scripts/run_lme_eval.sh
+# Specify the model and memory backend you want to use (e.g., mem0, zep)
+./scripts/run_lme_eval.sh
+```
+
+### PrefEval Evaluation
+
+### PersonaMem Evaluation
diff --git a/evaluation/configs-example/mirix_config.yaml b/evaluation/configs-example/mirix_config.yaml
@@ -0,0 +1,9 @@
+agent_name: mirix
+model_name: gpt-4o-mini
+model_endpoint: http://***.***.***.***:3000/v1
+api_key: sk-***REDACTED***
+embedding_model_name: text-embedding-3-small
+generation_config:
+  temperature: 0.8
+  max_tokens: 16192
+  context_window: 32768
diff --git a/evaluation/scripts/locomo/locomo_eval.py b/evaluation/scripts/locomo/locomo_eval.py
@@ -7,6 +7,7 @@
 
 import nltk
 import numpy as np
+import tiktoken
 import transformers
 
 from bert_score import score as bert_score
@@ -23,7 +24,7 @@
 
 logging.basicConfig(level=logging.CRITICAL)
 transformers.logging.set_verbosity_error()
-
+encoding = tiktoken.get_encoding("cl100k_base")
 # Download necessary NLTK resources
 try:
     nltk.download("wordnet", quiet=True)
@@ -173,7 +174,7 @@ def calculate_nlp_metrics(gold_answer, response, context, options=None):
     gold_answer = str(gold_answer) if gold_answer is not None else ""
     response = str(response) if response is not None else ""
 
-    metrics = {"context_tokens": len(nltk.word_tokenize(context)) if context else 0}
+    metrics = {"context_tokens": len(encoding.encode(context)) if context else 0}
 
     if "lexical" in options:
         gold_tokens = nltk.word_tokenize(gold_answer.lower())
@@ -363,11 +364,12 @@ async def limited_task(task):
         "--lib",
         type=str,
         choices=["zep", "memos", "mem0", "mem0_graph", "openai", "memos-api", "memobase"],
+        default="memos-api",
     )
     parser.add_argument(
         "--version",
         type=str,
-        default="default",
+        default="0917-test",
         help="Version identifier for loading results (e.g., 1010)",
     )
     parser.add_argument(
@@ -376,9 +378,9 @@ async def limited_task(task):
         default=3,
         help="Number of times to run the LLM grader for each question",
     )
-    parser.add_argument("--options", nargs="+", default=["lexical", "semantic"])
+    parser.add_argument("--options", nargs="+", default=[])
     parser.add_argument(
-        "--workers", type=int, default=4, help="Number of concurrent workers for processing groups"
+        "--workers", type=int, default=10, help="Number of concurrent workers for processing groups"
     )
     args = parser.parse_args()