Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1220572
feat: check nodes existence
Nyakult Jul 25, 2025
1396919
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 25, 2025
85b89bb
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 28, 2025
c8c1488
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 28, 2025
8baf5c6
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 29, 2025
9982782
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 30, 2025
f3dd6e7
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 30, 2025
4471790
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 1, 2025
0f9ccd4
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 1, 2025
27196ef
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 1, 2025
70d0a4a
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 5, 2025
5dd9662
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 6, 2025
27203e7
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 6, 2025
b2cd7f0
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 8, 2025
6fa6af7
feat: use different template for different language input
Nyakult Aug 8, 2025
b641c51
feat: use different template for different language input
Nyakult Aug 8, 2025
9f5aca1
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 12, 2025
5eafce4
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 13, 2025
5c2e637
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 18, 2025
332bab6
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 26, 2025
d3dca58
fix: eval script
Nyakult Aug 26, 2025
45cee24
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 27, 2025
b1b448e
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 1, 2025
ffb034e
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 3, 2025
4551297
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 4, 2025
204b545
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 5, 2025
0706f8e
chore: bump version to v1.1.1 (#346)
fridayL Sep 24, 2025
298d155
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 25, 2025
84d421d
feat: memos-api eval scripts
Nyakult Sep 25, 2025
6956cf0
Merge remote-tracking branch 'upstream/main' into eval/0925
Nyakult Sep 25, 2025
6a465f4
feat: mem reader
Nyakult Sep 25, 2025
bcfa7c9
feat: 实现了 prefeval memos-api evaluation scripts
2Rant Sep 25, 2025
eac7984
Merge pull request #2 from 2Rant/prefeval
Nyakult Sep 25, 2025
035d1a1
refactor:format code
Nyakult Sep 25, 2025
9c2ab81
feat: add PersonaMem eval scripts
Nyakult Sep 25, 2025
92b78c1
docs(evaluation): update PersonaMem eval readme
Nyakult Sep 25, 2025
82fecee
feat:memos-api ingest batch message
Nyakult Sep 25, 2025
d31a1fb
Merge remote-tracking branch 'upstream/dev' into eval/0925
Nyakult Oct 14, 2025
a312b80
format:ruff format
Nyakult Oct 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions evaluation/.env-example
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# memory process model
MODEL="gpt-4o-mini"
OPENAI_API_KEY="sk-***REDACTED***"
OPENAI_BASE_URL="http://***.***.***.***:3000/v1"
Expand All @@ -6,6 +7,13 @@ MEM0_API_KEY="m0-***REDACTED***"

ZEP_API_KEY="z_***REDACTED***"

# response model
CHAT_MODEL="gpt-4o-mini"
CHAT_MODEL_BASE_URL="http://***.***.***.***:3000/v1"
CHAT_MODEL_API_KEY="sk-***REDACTED***"

MEMOS_KEY="Token mpg-xxxxx"
MEMOS_URL="https://apigw-pre.memtensor.cn/api/openmem/v1"

MEMOBASE_API_KEY="xxxxx"
MEMOBASE_PROJECT_URL="http://xxx.xxx.xxx.xxx:8019"
18 changes: 18 additions & 0 deletions evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,21 @@ This repository provides tools and scripts for evaluating the LoCoMo dataset usi
```

✍️ For evaluating OpenAI's native memory feature with the LoCoMo dataset, please refer to the detailed guide: [OpenAI Memory on LoCoMo - Evaluation Guide](./scripts/locomo/openai_memory_locomo_eval_guide.md).

### LongMemEval Evaluation
First, prepare the `longmemeval_s` dataset from https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned and save it as `data/longmemeval/longmemeval_s.json`.

```bash
# Edit the configuration in ./scripts/run_lme_eval.sh
# Specify the model and memory backend you want to use (e.g., mem0, zep, etc.)
./scripts/run_lme_eval.sh
```

### prefEval Evaluation

### personaMem Evaluation
Get `questions_32k.csv` and `shared_contexts_32k.jsonl` from https://huggingface.co/datasets/bowen-upenn/PersonaMem and save them in `data/personamem/`.
```bash
./scripts/run_pm_eval.sh
```
300 changes: 300 additions & 0 deletions evaluation/scripts/PrefEval/pref_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
import asyncio
import aiohttp
import json
import re
import argparse
from typing import Dict, Any
from collections import Counter
from tqdm.asyncio import tqdm
import os

# Judge-model credentials, read from the environment (see evaluation/.env-example).
# NOTE(review): the env var name suggests a *base* URL ("…/v1"), but confirm
# whether the full chat/completions endpoint is what this script expects here.
API_KEY = os.getenv("OPENAI_API_KEY")
API_URL = os.getenv("OPENAI_BASE_URL")

# Input: one JSON object per line with "preference", "question", "response".
# Output: each input record augmented with four sub-evaluations + error type.
INPUT_FILE = "./data/prefeval/pref_memos.jsonl"
OUTPUT_FILE = "./data/prefeval/eval_pref_memos.jsonl"


async def call_gpt4o_mini_async(session: aiohttp.ClientSession, prompt: str) -> str:
    """Send *prompt* as a single user message to the judge model and return its reply.

    Returns "" on any failure (missing config, network error, non-2xx status,
    malformed body) so callers can degrade gracefully instead of raising.
    """
    if not API_URL:
        print("API call failed: OPENAI_BASE_URL is not set")
        return ""

    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_KEY}"}
    payload = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,  # deterministic judging
        "max_tokens": 500,
    }

    # OPENAI_BASE_URL in .env-example is a base URL ("http://host:3000/v1");
    # the chat-completions endpoint lives below it. POSTing to the bare base
    # URL would 404 against an OpenAI-compatible server.
    url = f"{API_URL.rstrip('/')}/chat/completions"

    try:
        async with session.post(
            url,
            headers=headers,
            json=payload,
            timeout=aiohttp.ClientTimeout(total=30),  # numeric timeout= is deprecated
        ) as response:
            response.raise_for_status()
            data = await response.json()
            return data["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"API call failed: {str(e)}")
        return ""


def parse_xml_response(response: str, tag: str) -> str:
    """Return the stripped text inside the first <tag>...</tag> pair, or ""."""
    found = re.search(rf"<{tag}>(.*?)</{tag}>", response, re.DOTALL)
    if found is None:
        return ""
    return found.group(1).strip()


async def evaluate_violate_preference_async(
    session: aiohttp.ClientSession, preference: str, question: str, response: str
) -> Dict[str, str]:
    """Judge whether *response* violates the user's stated *preference*.

    Returns {"explanation": ..., "answer": "Yes"/"No"} parsed from the judge
    model's XML reply; either field is "" when the reply is malformed or the
    API call failed.
    """
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant's response violates the user's stated preference.
Evaluate the response based on these criteria:

Answer "Yes" if:

1. The response contains any recommendations that directly contradict or violate the user's stated preference without any reasoning for why it recommends these contrasting items and no acknowledgment of the user's preference at all.

Answer "No" if:

1. The response respects the user's stated preference.
2. It's acceptable if it includes options that could potentially conflict with the preference, as long as it has acknowledged the user's preference and also specified the reason why or how to avoid the conflict.
3. If the response does not acknowledge the user's preference and if no recommendations violate the user's preference, there is no violation.

Here is the conversation:
User's preference: {preference}
User query: {question}
Assistant response: {response}

Examine the response meticulously and answer. Answer in this exact XML format:
<explanation>[1 very short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(session, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }


async def evaluate_acknowledge_preference_async(
    session: aiohttp.ClientSession, question: str, response: str
) -> Dict[str, str]:
    """Judge whether *response* acknowledges (mentions/assumes) any user preference.

    Returns {"preference_mention": <quoted sentence or "">, "answer": "Yes"/"No"}.
    The extracted quote is later fed into the hallucination check by the caller.
    """
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant acknowledges any user preference in answering the user's query.

Evaluate the response based on these stringent criteria:

1. Check if the response explicitly or implicitly mentions or references a user preference. 2. The content of the preference is irrelevant for this check; only its presence matters.

Answer "Yes" if:

1. The response explicitly mentions or refers to a user preference in answering the user's question. Examples include: "Based on your previous preference for xxxx"; "Based on your previous preference, I would suggest you xxxx"; "Since you mentioned you prefer/dislike xxxx"; "Since you are a xxxx"; "I will recommend the following given your preference for xxx", etc. You should extract the "xxxx" in your answer. If it only says "Based on our previous conversation, I recommend..." and does not explicitly mention any preference, you should answer 'No'.
2. The response assumes the user preference in answering the user's question implicitly. For example, when the user asks 'Can you recommend me cars to drive?', if the response is 'Based on your preference, I will recommend non-EV cars, ...', then this indicates the assistant assumes that the user's preference is a dislike of EV cars, and you should answer 'Yes'.

Answer "No" if the response does not mention or refer to any user preference explicitly or implicitly. If you cannot extract the sentence stating what the preference is, answer 'No'.

Here is the assistant's response:
The user's question is: {question}
Assistant response: {response}

Examine the response meticulously and answer. Please answer in this exact XML format without any additional text:
<preference>[quote of the sentence that acknowledges/mentions what the preference is; leave it blank if there is none]</preference>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(session, prompt)
    return {
        "preference_mention": parse_xml_response(api_response, "preference"),
        "answer": parse_xml_response(api_response, "answer"),
    }


async def evaluate_hallucinate_preference_async(
    session: aiohttp.ClientSession, preference: str, restatement: str
) -> Dict[str, str]:
    """Judge whether the assistant's *restatement* hallucinates the original *preference*.

    Short-circuits with answer "No" when *restatement* is blank (nothing was
    restated, so nothing could be hallucinated) — no API call is made in that case.
    Returns {"explanation": ..., "answer": "Yes"/"No"}.
    """
    if not restatement.strip():
        return {"explanation": "No restatement provided by assistant", "answer": "No"}

    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant's restatement of the user's stated preference is the same preference. Evaluate the response based on these stringent criteria to answer if the assistant has hallucinated the preference or not:

Answer "Yes" if:

1. The assistant's restatement differs from the original preference in meaning or intent.
2. The assistant's restatement contradicts the original preference.
3. The assistant's restatement is not related or irrelevant to the original preference.

Answer "No" if has not hallucinated:

1. The assistant's restatement matches the original preference.
2. The assistant's restatement is a minor paraphrase that fully preserves the meaning and intent of the original preference.
3. The restatement is just empty, no hallucination.

Here is the information:
Original user preference: "{preference}"
Assistant's restatement: "{restatement}"
Examine the original preference and the assistant's restatement meticulously and answer. Please answer in this exact XML format without any other additional text:
<explanation>[1 short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(session, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }


async def evaluate_helpful_response_async(
    session: aiohttp.ClientSession, question: str, response: str
) -> Dict[str, str]:
    """Judge whether *response* gives substantive help for *question* (vs. apologizing).

    Returns {"explanation": ..., "answer": "Yes"/"No"} parsed from the judge
    model's XML reply; either field is "" when the reply is malformed or the
    API call failed.
    """
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant provides any substantive response to the user's query.
Evaluate the response based on these stringent criteria:

Answer "Yes" if any of the following:

1. The response provides specific, relevant recommendations or advice related to the user's query without any apologetic statements or claims of inability to respond.
2. It remembers the user's preference and provides recommendations or suggestions to the user's query without any apologetic statements or claims of inability to respond.

Answer "No" if any of the following:

1. The response contains apologetic statements or claims of inability to respond due to lack of memory of the user's preference.
2. The assistant asks for more information without providing any substantive suggestions, advices directly related to the query.
3. The response doesn't address the user's query.
4. Even if the agent provides some relevant recommendations, if the agent first expresses an inability to follow the user's preference, answer 'No'.

Examples of unhelpful responses: (Answer: No):

"Unfortunately, I don't actually have enough context about your specific preferences for xxx"
"Unfortunately, we haven't had a previous discussion about your preferences for xxx. Could you let me know your preference for xxx?"
"I apologize, but I don't have access to your personal information or previous conversations."
"I'm sorry, but I can't provide a specific answer without more details."

Here is the conversation:
User query: {question}
Assistant response: {response}

Examine the response meticulously and answer. Answer in this exact XML format:
<explanation>[1 very short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(session, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }


def classify_error_type(evaluation_results: Dict[str, Any]) -> str:
    """Map the four yes/no sub-evaluation answers to a single error-type label.

    Comparisons are case/whitespace-insensitive (and ignore a trailing period)
    because the answers come from an LLM, which may emit "yes", "Yes." etc.
    instead of the exact "Yes"/"No" — previously such drift silently fell
    through to "Unknown/No Error".
    """

    def _answer(key: str) -> str:
        # Normalize e.g. " Yes." -> "yes" so minor formatting drift still matches.
        return evaluation_results[key]["answer"].strip().rstrip(".").lower()

    violate = _answer("violate_preference")
    acknowledge = _answer("acknowledge_preference")
    hallucinate = _answer("hallucinate_preference")
    helpful = _answer("helpful_response")

    if violate == "yes" and acknowledge == "no" and helpful == "yes":
        return "Preference-Unaware Violation"
    if violate == "yes" and acknowledge == "yes" and hallucinate == "yes" and helpful == "yes":
        return "Preference Hallucination Violation"
    if violate == "yes" and acknowledge == "yes" and hallucinate == "no" and helpful == "yes":
        return "Inconsistency Violation"
    if violate == "no" and helpful == "no":
        return "Unhelpful Response"
    return "Unknown/No Error"


async def process_line(
    line: str, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore
) -> Dict[str, Any]:
    """Run all four judge evaluations for one JSONL record and classify its error type."""
    async with semaphore:
        record = json.loads(line.strip())
        preference = record["preference"]
        question = record["question"]
        response = record["response"]

        # The acknowledgement check must run first: its extracted preference
        # quote feeds the hallucination check below.
        ack_result = await evaluate_acknowledge_preference_async(session, question, response)

        violate_result, halluc_result, helpful_result = await asyncio.gather(
            evaluate_violate_preference_async(session, preference, question, response),
            evaluate_hallucinate_preference_async(
                session, preference, ack_result["preference_mention"]
            ),
            evaluate_helpful_response_async(session, question, response),
        )

        evaluations = {
            "violate_preference": violate_result,
            "acknowledge_preference": ack_result,
            "hallucinate_preference": halluc_result,
            "helpful_response": helpful_result,
        }

        return {
            "original_data": record,
            "evaluations": evaluations,
            "error_type": classify_error_type(evaluations),
        }


def log_summary(error_counter: Counter, total_samples: int) -> Dict[str, Dict[str, float]]:
    """Print the error-type breakdown and return it as {type: {count, percentage}}."""
    summary_data: Dict[str, Dict[str, float]] = {}
    print("\n--- Error Type Summary ---")

    if not total_samples:
        print("No samples were processed.")
        print("--------------------------")
        return summary_data

    print(f"Total samples processed: {total_samples}")

    # most_common() yields (type, count) pairs in descending-count order,
    # stable on ties — same ordering as a manual reverse sort by count.
    for error_type, count in error_counter.most_common():
        percentage = count / total_samples * 100
        summary_data[error_type] = {"count": count, "percentage": percentage}
        print(f"- {error_type}: {count} ({percentage:.2f}%)")

    print("--------------------------")
    print("\nProcessing complete.")

    return summary_data


async def main(concurrency_limit: int):
    """Evaluate every record in INPUT_FILE concurrently; write results to OUTPUT_FILE.

    Each output line is the input record plus its four sub-evaluations and the
    derived error-type label. A summary table is printed at the end. At most
    *concurrency_limit* judge API calls run at once (via process_line's semaphore).
    """
    semaphore = asyncio.Semaphore(concurrency_limit)
    error_counter = Counter()

    print(f"Starting evaluation with a concurrency limit of {concurrency_limit}...")

    async with aiohttp.ClientSession() as session:
        try:
            with open(INPUT_FILE, "r", encoding="utf-8") as f:
                # Skip blank lines so a trailing newline doesn't surface as a
                # spurious JSON-decode error during processing.
                lines = [line for line in f if line.strip()]
        except FileNotFoundError:
            print(f"Error: Input file not found at '{INPUT_FILE}'")
            return

        tasks = [process_line(line, session, semaphore) for line in lines]

        with open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:
            pbar = tqdm(
                asyncio.as_completed(tasks),
                total=len(tasks),
                desc="Processing samples concurrently",
                unit="sample",
            )
            for future in pbar:
                try:
                    result = await future
                    outfile.write(json.dumps(result, ensure_ascii=False) + "\n")

                    error_type = result["error_type"]
                    error_counter[error_type] += 1
                    pbar.set_postfix({"Latest Type": error_type})

                except Exception as e:
                    # One bad sample must not abort the whole run.
                    print(f"An error occurred while processing a line: {e}")

    # log_summary's return value is informational; nothing downstream uses it,
    # so the previously-unused binding is dropped.
    log_summary(error_counter, len(lines))


if __name__ == "__main__":
    # CLI entry point: the only knob is how many judge API calls may be in
    # flight at once (input/output paths are module-level constants).
    parser = argparse.ArgumentParser(description="Evaluate assistant responses from a JSONL file.")
    parser.add_argument(
        "--concurrency-limit",
        type=int,
        default=10,
        help="The maximum number of concurrent API calls.",
    )
    args = parser.parse_args()

    asyncio.run(main(concurrency_limit=args.concurrency_limit))
Loading
Loading