MemTensor · fridayL · Oct 24, 2025 · Jul 25, 2025 · Jul 25, 2025 · Jul 28, 2025
diff --git a/evaluation/.env-example b/evaluation/.env-example
@@ -3,21 +3,28 @@ MODEL="gpt-4o-mini"
 OPENAI_API_KEY="sk-***REDACTED***"
 OPENAI_BASE_URL="http://***.***.***.***:3000/v1"
 
-MEM0_API_KEY="m0-***REDACTED***"
-
-ZEP_API_KEY="z_***REDACTED***"
 
 # response model
 CHAT_MODEL="gpt-4o-mini"
 CHAT_MODEL_BASE_URL="http://***.***.***.***:3000/v1"
 CHAT_MODEL_API_KEY="sk-***REDACTED***"
 
+# memos
 MEMOS_KEY="Token mpg-xxxxx"
-MEMOS_URL="https://apigw-pre.memtensor.cn/api/openmem/v1"
-PRE_SPLIT_CHUNK=false  # pre split chunk in client end
+MEMOS_URL="http://127.0.0.1:8001"
+MEMOS_ONLINE_URL="https://memos.memtensor.cn/api/openmem/v1"
+
+# other memory agents
+MEM0_API_KEY="m0-xxx"
+ZEP_API_KEY="z_xxx"
+MEMU_API_KEY="mu_xxx"
+SUPERMEMORY_API_KEY="sm_xxx"
+MEMOBASE_API_KEY="xxx"
+MEMOBASE_PROJECT_URL="http://***.***.***.***:8019"
+
+# eval settings
+PRE_SPLIT_CHUNK=false
 
-MEMOBASE_API_KEY="xxxxx"
-MEMOBASE_PROJECT_URL="http://xxx.xxx.xxx.xxx:8019"
 
 # Configuration Only For Scheduler
 # RabbitMQ Configuration

diff --git a/evaluation/README.md b/evaluation/README.md
@@ -21,7 +21,14 @@ This repository provides tools and scripts for evaluating the LoCoMo dataset usi
 
 2. Copy the `configs-example/` directory to a new directory named `configs/`, and modify the configuration files inside it as needed. This directory contains model and API-specific settings.
 
+## Setup MemOS
+```bash
+# start server
+uvicorn memos.api.server_api:app --host 0.0.0.0 --port 8001 --workers 8
 
+# modify .env file
+MEMOS_URL="http://127.0.0.1:8001"
+```
 ## Evaluation Scripts
 
 ### LoCoMo Evaluation
@@ -45,10 +52,20 @@ First prepare the dataset `longmemeval_s` from https://huggingface.co/datasets/x
 ./scripts/run_lme_eval.sh
 ```
 
-### prefEval Evaluation
+### PrefEval Evaluation
+To evaluate the **PrefEval** dataset using one of the supported memory frameworks — `memos`, `mem0`, or `zep` — run the following [script](./scripts/run_prefeval_eval.sh):
 
-### personaMem Evaluation
+```bash
+# Edit the configuration in ./scripts/run_prefeval_eval.sh
+# Specify the model and memory backend you want to use (e.g., mem0, zep, etc.)
+./scripts/run_prefeval_eval.sh
+```
+
+### PersonaMem Evaluation
 get `questions_32k.csv` and `shared_contexts_32k.jsonl` from https://huggingface.co/datasets/bowen-upenn/PersonaMem and save them at `data/personamem/`
 ```bash
+# Edit the configuration in ./scripts/run_pm_eval.sh
+# Specify the model and memory backend you want to use (e.g., mem0, zep, etc.)
+# If you want to use MIRIX, edit the configuration in ./scripts/personamem/config.yaml
 ./scripts/run_pm_eval.sh
 ```
diff --git a/evaluation/scripts/PrefEval/pref_eval.py b/evaluation/scripts/PrefEval/pref_eval.py
@@ -15,10 +15,6 @@
 API_KEY = os.getenv("OPENAI_API_KEY")
 API_URL = os.getenv("OPENAI_BASE_URL")
 
-INPUT_FILE = "./results/prefeval/pref_memos_process.jsonl"
-OUTPUT_FILE = "./results/prefeval/eval_pref_memos.jsonl"
-OUTPUT_EXCEL_FILE = "./results/prefeval/eval_pref_memos_summary.xlsx"
-
 
 async def call_gpt4o_mini_async(client: OpenAI, prompt: str) -> str:
     messages = [{"role": "user", "content": prompt}]
@@ -255,9 +251,10 @@ def generate_excel_summary(
     avg_search_time: float,
     avg_context_tokens: float,
     avg_add_time: float,
+    output_excel_file: str,
     model_name: str = "gpt-4o-mini",
 ):
-    print(f"Generating Excel summary at {OUTPUT_EXCEL_FILE}...")
+    print(f"Generating Excel summary at {output_excel_file}...")
 
     def get_pct(key):
         return summary_results.get(key, {}).get("percentage", 0)
@@ -282,7 +279,7 @@ def get_pct(key):
 
     df = pd.DataFrame(data)
 
-    with pd.ExcelWriter(OUTPUT_EXCEL_FILE, engine="xlsxwriter") as writer:
+    with pd.ExcelWriter(output_excel_file, engine="xlsxwriter") as writer:
         df.to_excel(writer, index=False, sheet_name="Summary")
 
         workbook = writer.book
@@ -300,10 +297,10 @@ def get_pct(key):
         bold_pct_format = workbook.add_format({"num_format": "0.0%", "bold": True})
         worksheet.set_column("F:F", 18, bold_pct_format)
 
-    print(f"Successfully saved summary to {OUTPUT_EXCEL_FILE}")
+    print(f"Successfully saved summary to {output_excel_file}")
 
 
-async def main(concurrency_limit: int):
+async def main(concurrency_limit: int, input_file: str, output_file: str, output_excel_file: str):
     semaphore = asyncio.Semaphore(concurrency_limit)
     error_counter = Counter()
 
@@ -313,17 +310,17 @@ async def main(concurrency_limit: int):
     total_add_time = 0
 
     print(f"Starting evaluation with a concurrency limit of {concurrency_limit}...")
-    print(f"Input file: {INPUT_FILE}")
-    print(f"Output JSONL: {OUTPUT_FILE}")
-    print(f"Output Excel: {OUTPUT_EXCEL_FILE}")
+    print(f"Input file: {input_file}")
+    print(f"Output JSONL: {output_file}")
+    print(f"Output Excel: {output_excel_file}")
 
     client = OpenAI(api_key=API_KEY, base_url=API_URL)
 
     try:
-        with open(INPUT_FILE, "r", encoding="utf-8") as f:
+        with open(input_file, "r", encoding="utf-8") as f:
             lines = f.readlines()
     except FileNotFoundError:
-        print(f"Error: Input file not found at '{INPUT_FILE}'")
+        print(f"Error: Input file not found at '{input_file}'")
         return
 
     if not lines:
@@ -332,7 +329,7 @@ async def main(concurrency_limit: int):
 
     tasks = [process_line(line, client, semaphore) for line in lines]
 
-    with open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:
+    with open(output_file, "w", encoding="utf-8") as outfile:
         pbar = tqdm(
             asyncio.as_completed(tasks),
             total=len(tasks),
@@ -382,13 +379,19 @@ async def main(concurrency_limit: int):
             avg_search_time,
             avg_context_tokens,
             avg_add_time,
+            output_excel_file,
         )
     except Exception as e:
         print(f"\nFailed to generate Excel file: {e}")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Evaluate assistant responses from a JSONL file.")
+
+    parser.add_argument(
+        "--input", type=str, required=True, help="Path to the input JSONL file from pref_memos.py."
+    )
+
     parser.add_argument(
         "--concurrency-limit",
         type=int,
@@ -397,4 +400,17 @@ async def main(concurrency_limit: int):
     )
     args = parser.parse_args()
 
-    asyncio.run(main(concurrency_limit=args.concurrency_limit))
+    input_path = args.input
+    output_dir = os.path.dirname(input_path)
+
+    output_jsonl_path = os.path.join(output_dir, "eval_pref_memos.jsonl")
+    output_excel_path = os.path.join(output_dir, "eval_pref_memos_summary.xlsx")
+
+    asyncio.run(
+        main(
+            concurrency_limit=args.concurrency_limit,
+            input_file=input_path,
+            output_file=output_jsonl_path,
+            output_excel_file=output_excel_path,
+        )
+    )