Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1220572
feat: check nodes existence
Nyakult Jul 25, 2025
1396919
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 25, 2025
85b89bb
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 28, 2025
c8c1488
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 28, 2025
8baf5c6
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 29, 2025
9982782
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 30, 2025
f3dd6e7
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Jul 30, 2025
4471790
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 1, 2025
0f9ccd4
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 1, 2025
27196ef
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 1, 2025
70d0a4a
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 5, 2025
5dd9662
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 6, 2025
27203e7
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 6, 2025
b2cd7f0
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 8, 2025
6fa6af7
feat: use different template for different language input
Nyakult Aug 8, 2025
b641c51
feat: use different template for different language input
Nyakult Aug 8, 2025
9f5aca1
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 12, 2025
5eafce4
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 13, 2025
5c2e637
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 18, 2025
332bab6
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 26, 2025
d3dca58
fix: eval script
Nyakult Aug 26, 2025
45cee24
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Aug 27, 2025
b1b448e
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 1, 2025
ffb034e
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 3, 2025
4551297
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 4, 2025
204b545
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 5, 2025
0706f8e
chore: bump version to v1.1.1 (#346)
fridayL Sep 24, 2025
298d155
Merge remote-tracking branch 'upstream/dev' into dev
Nyakult Sep 25, 2025
84d421d
feat: memos-api eval scripts
Nyakult Sep 25, 2025
6956cf0
Merge remote-tracking branch 'upstream/main' into eval/0925
Nyakult Sep 25, 2025
6a465f4
feat: mem reader
Nyakult Sep 25, 2025
bcfa7c9
feat: 实现了 prefeval memos-api evaluation scripts
2Rant Sep 25, 2025
eac7984
Merge pull request #2 from 2Rant/prefeval
Nyakult Sep 25, 2025
035d1a1
refactor:format code
Nyakult Sep 25, 2025
9c2ab81
feat: add PersonaMem eval scripts
Nyakult Sep 25, 2025
92b78c1
docs(evaluation): update PersonaMem eval readme
Nyakult Sep 25, 2025
82fecee
feat:memos-api ingest batch message
Nyakult Sep 25, 2025
d31a1fb
Merge remote-tracking branch 'upstream/dev' into eval/0925
Nyakult Oct 14, 2025
a312b80
format:ruff format
Nyakult Oct 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions evaluation/.env-example
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# memory process model
MODEL="gpt-4o-mini"
OPENAI_API_KEY="sk-***REDACTED***"
OPENAI_BASE_URL="http://***.***.***.***:3000/v1"
Expand All @@ -6,6 +7,13 @@ MEM0_API_KEY="m0-***REDACTED***"

ZEP_API_KEY="z_***REDACTED***"

# response model
CHAT_MODEL="gpt-4o-mini"
CHAT_MODEL_BASE_URL="http://***.***.***.***:3000/v1"
CHAT_MODEL_API_KEY="sk-***REDACTED***"

MEMOS_KEY="Token mpg-xxxxx"
MEMOS_URL="https://apigw-pre.memtensor.cn/api/openmem/v1"

MEMOBASE_API_KEY="xxxxx"
MEMOBASE_PROJECT_URL="http://xxx.xxx.xxx.xxx:8019"
18 changes: 18 additions & 0 deletions evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,21 @@ This repository provides tools and scripts for evaluating the LoCoMo dataset usi
```

✍️ For evaluating OpenAI's native memory feature with the LoCoMo dataset, please refer to the detailed guide: [OpenAI Memory on LoCoMo - Evaluation Guide](./scripts/locomo/openai_memory_locomo_eval_guide.md).

### LongMemEval Evaluation
First, prepare the `longmemeval_s` dataset from https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned and save it as `data/longmemeval/longmemeval_s.json`.

```bash
# Edit the configuration in ./scripts/run_lme_eval.sh
# Specify the model and memory backend you want to use (e.g., mem0, zep, etc.)
./scripts/run_lme_eval.sh
```

### prefEval Evaluation

### personaMem Evaluation
Get `questions_32k.csv` and `shared_contexts_32k.jsonl` from https://huggingface.co/datasets/bowen-upenn/PersonaMem and save them in `data/personamem/`.
```bash
./scripts/run_pm_eval.sh
```
300 changes: 300 additions & 0 deletions evaluation/scripts/PrefEval/pref_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
import asyncio
import aiohttp
import json
import re
import argparse
from typing import Dict, Any
from collections import Counter
from tqdm.asyncio import tqdm
import os

# Judge-model credentials, read from the environment (see evaluation/.env-example).
# NOTE(review): the env var name suggests a *base* URL ("…/v1"), but confirm
# whether the full chat/completions endpoint is what this script expects here.
API_KEY = os.getenv("OPENAI_API_KEY")
API_URL = os.getenv("OPENAI_BASE_URL")

# Input: one JSON object per line with "preference", "question", "response".
# Output: each input record augmented with four sub-evaluations + error type.
INPUT_FILE = "./data/prefeval/pref_memos.jsonl"
OUTPUT_FILE = "./data/prefeval/eval_pref_memos.jsonl"


async def call_gpt4o_mini_async(session: aiohttp.ClientSession, prompt: str) -> str:
    """Send *prompt* as a single user message to the judge model and return its reply.

    Returns "" on any failure (missing config, network error, non-2xx status,
    malformed body) so callers can degrade gracefully instead of raising.
    """
    if not API_URL:
        print("API call failed: OPENAI_BASE_URL is not set")
        return ""

    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_KEY}"}
    payload = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,  # deterministic judging
        "max_tokens": 500,
    }

    # OPENAI_BASE_URL in .env-example is a base URL ("http://host:3000/v1");
    # the chat-completions endpoint lives below it. POSTing to the bare base
    # URL would 404 against an OpenAI-compatible server.
    url = f"{API_URL.rstrip('/')}/chat/completions"

    try:
        async with session.post(
            url,
            headers=headers,
            json=payload,
            timeout=aiohttp.ClientTimeout(total=30),  # numeric timeout= is deprecated
        ) as response:
            response.raise_for_status()
            data = await response.json()
            return data["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"API call failed: {str(e)}")
        return ""


def parse_xml_response(response: str, tag: str) -> str:
    """Return the stripped text inside the first <tag>...</tag> pair, or ""."""
    found = re.search(rf"<{tag}>(.*?)</{tag}>", response, re.DOTALL)
    if found is None:
        return ""
    return found.group(1).strip()


async def evaluate_violate_preference_async(
    session: aiohttp.ClientSession, preference: str, question: str, response: str
) -> Dict[str, str]:
    """Judge whether *response* violates the user's stated *preference*.

    Returns {"explanation": ..., "answer": "Yes"/"No"} parsed from the judge
    model's XML reply; either field is "" when the reply is malformed or the
    API call failed.
    """
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant's response violates the user's stated preference.
Evaluate the response based on these criteria:

Answer "Yes" if:

1. The response contains any recommendations that directly contradict or violate the user's stated preference without any reasoning for why it recommends these contrasting items and no acknowledgment of the user's preference at all.

Answer "No" if:

1. The response respects the user's stated preference.
2. It's acceptable if it includes options that could potentially conflict with the preference, as long as it has acknowledged the user's preference and also specified the reason why or how to avoid the conflict.
3. If the response does not acknowledge the user's preference and if no recommendations violate the user's preference, there is no violation.

Here is the conversation:
User's preference: {preference}
User query: {question}
Assistant response: {response}

Examine the response meticulously and answer. Answer in this exact XML format:
<explanation>[1 very short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(session, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }


async def evaluate_acknowledge_preference_async(
    session: aiohttp.ClientSession, question: str, response: str
) -> Dict[str, str]:
    """Judge whether *response* acknowledges (mentions/assumes) any user preference.

    Returns {"preference_mention": <quoted sentence or "">, "answer": "Yes"/"No"}.
    The extracted quote is later fed into the hallucination check by the caller.
    """
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant acknowledges any user preference in answering the user's query.

Evaluate the response based on these stringent criteria:

1. Check if the response explicitly or implicitly mentions or references a user preference. 2. The content of the preference is irrelevant for this check; only its presence matters.

Answer "Yes" if:

1. The response explicitly mentions or refers to a user preference in answering the user's question. Examples include: "Based on your previous preference for xxxx"; "Based on your previous preference, I would suggest you xxxx"; "Since you mentioned you prefer/dislike xxxx"; "Since you are a xxxx"; "I will recommend the following given your preference for xxx", etc. You should extract the "xxxx" in your answer. If it only says "Based on our previous conversation, I recommend..." and does not explicitly mention any preference, you should answer 'No'.
2. The response assumes the user preference in answering the user's question implicitly. For example, when the user asks 'Can you recommend me cars to drive?', if the response is 'Based on your preference, I will recommend non-EV cars, ...', then this indicates the assistant assumes that the user's preference is a dislike of EV cars, and you should answer 'Yes'.

Answer "No" if the response does not mention or refer to any user preference explicitly or implicitly. If you cannot extract the sentence stating what the preference is, answer 'No'.

Here is the assistant's response:
The user's question is: {question}
Assistant response: {response}

Examine the response meticulously and answer. Please answer in this exact XML format without any additional text:
<preference>[quote of the sentence that acknowledges/mentions what the preference is; leave it blank if there is none]</preference>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(session, prompt)
    return {
        "preference_mention": parse_xml_response(api_response, "preference"),
        "answer": parse_xml_response(api_response, "answer"),
    }


async def evaluate_hallucinate_preference_async(
    session: aiohttp.ClientSession, preference: str, restatement: str
) -> Dict[str, str]:
    """Judge whether the assistant's *restatement* hallucinates the original *preference*.

    Short-circuits with answer "No" when *restatement* is blank (nothing was
    restated, so nothing could be hallucinated) — no API call is made in that case.
    Returns {"explanation": ..., "answer": "Yes"/"No"}.
    """
    if not restatement.strip():
        return {"explanation": "No restatement provided by assistant", "answer": "No"}

    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant's restatement of the user's stated preference is the same preference. Evaluate the response based on these stringent criteria to answer if the assistant has hallucinated the preference or not:

Answer "Yes" if:

1. The assistant's restatement differs from the original preference in meaning or intent.
2. The assistant's restatement contradicts the original preference.
3. The assistant's restatement is not related or irrelevant to the original preference.

Answer "No" if has not hallucinated:

1. The assistant's restatement matches the original preference.
2. The assistant's restatement is a minor paraphrase that fully preserves the meaning and intent of the original preference.
3. The restatement is just empty, no hallucination.

Here is the information:
Original user preference: "{preference}"
Assistant's restatement: "{restatement}"
Examine the original preference and the assistant's restatement meticulously and answer. Please answer in this exact XML format without any other additional text:
<explanation>[1 short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(session, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }


async def evaluate_helpful_response_async(
    session: aiohttp.ClientSession, question: str, response: str
) -> Dict[str, str]:
    """Judge whether *response* gives substantive help for *question* (vs. apologizing).

    Returns {"explanation": ..., "answer": "Yes"/"No"} parsed from the judge
    model's XML reply; either field is "" when the reply is malformed or the
    API call failed.
    """
    prompt = f"""You will analyze a conversation between a user and an assistant, focusing on whether the assistant provides any substantive response to the user's query.
Evaluate the response based on these stringent criteria:

Answer "Yes" if any of the following:

1. The response provides specific, relevant recommendations or advice related to the user's query without any apologetic statements or claims of inability to respond.
2. It remembers the user's preference and provides recommendations or suggestions to the user's query without any apologetic statements or claims of inability to respond.

Answer "No" if any of the following:

1. The response contains apologetic statements or claims of inability to respond due to lack of memory of the user's preference.
2. The assistant asks for more information without providing any substantive suggestions, advices directly related to the query.
3. The response doesn't address the user's query.
4. Even if the agent provides some relevant recommendations, if the agent first expresses an inability to follow the user's preference, answer 'No'.

Examples of unhelpful responses: (Answer: No):

"Unfortunately, I don't actually have enough context about your specific preferences for xxx"
"Unfortunately, we haven't had a previous discussion about your preferences for xxx. Could you let me know your preference for xxx?"
"I apologize, but I don't have access to your personal information or previous conversations."
"I'm sorry, but I can't provide a specific answer without more details."

Here is the conversation:
User query: {question}
Assistant response: {response}

Examine the response meticulously and answer. Answer in this exact XML format:
<explanation>[1 very short sentence explanation]</explanation>
<answer>[Yes/No]</answer>"""

    api_response = await call_gpt4o_mini_async(session, prompt)
    return {
        "explanation": parse_xml_response(api_response, "explanation"),
        "answer": parse_xml_response(api_response, "answer"),
    }


def classify_error_type(evaluation_results: Dict[str, Any]) -> str:
    """Map the four yes/no sub-evaluation answers to a single error-type label.

    Comparisons are case/whitespace-insensitive (and ignore a trailing period)
    because the answers come from an LLM, which may emit "yes", "Yes." etc.
    instead of the exact "Yes"/"No" — previously such drift silently fell
    through to "Unknown/No Error".
    """

    def _answer(key: str) -> str:
        # Normalize e.g. " Yes." -> "yes" so minor formatting drift still matches.
        return evaluation_results[key]["answer"].strip().rstrip(".").lower()

    violate = _answer("violate_preference")
    acknowledge = _answer("acknowledge_preference")
    hallucinate = _answer("hallucinate_preference")
    helpful = _answer("helpful_response")

    if violate == "yes" and acknowledge == "no" and helpful == "yes":
        return "Preference-Unaware Violation"
    if violate == "yes" and acknowledge == "yes" and hallucinate == "yes" and helpful == "yes":
        return "Preference Hallucination Violation"
    if violate == "yes" and acknowledge == "yes" and hallucinate == "no" and helpful == "yes":
        return "Inconsistency Violation"
    if violate == "no" and helpful == "no":
        return "Unhelpful Response"
    return "Unknown/No Error"


async def process_line(
    line: str, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore
) -> Dict[str, Any]:
    """Run all four judge evaluations for one JSONL record and classify its error type."""
    async with semaphore:
        record = json.loads(line.strip())
        preference = record["preference"]
        question = record["question"]
        response = record["response"]

        # The acknowledgement check must run first: its extracted preference
        # quote feeds the hallucination check below.
        ack_result = await evaluate_acknowledge_preference_async(session, question, response)

        violate_result, halluc_result, helpful_result = await asyncio.gather(
            evaluate_violate_preference_async(session, preference, question, response),
            evaluate_hallucinate_preference_async(
                session, preference, ack_result["preference_mention"]
            ),
            evaluate_helpful_response_async(session, question, response),
        )

        evaluations = {
            "violate_preference": violate_result,
            "acknowledge_preference": ack_result,
            "hallucinate_preference": halluc_result,
            "helpful_response": helpful_result,
        }

        return {
            "original_data": record,
            "evaluations": evaluations,
            "error_type": classify_error_type(evaluations),
        }


def log_summary(error_counter: Counter, total_samples: int) -> Dict[str, Dict[str, float]]:
    """Print the error-type breakdown and return it as {type: {count, percentage}}."""
    summary_data: Dict[str, Dict[str, float]] = {}
    print("\n--- Error Type Summary ---")

    if not total_samples:
        print("No samples were processed.")
        print("--------------------------")
        return summary_data

    print(f"Total samples processed: {total_samples}")

    # most_common() yields (type, count) pairs in descending-count order,
    # stable on ties — same ordering as a manual reverse sort by count.
    for error_type, count in error_counter.most_common():
        percentage = count / total_samples * 100
        summary_data[error_type] = {"count": count, "percentage": percentage}
        print(f"- {error_type}: {count} ({percentage:.2f}%)")

    print("--------------------------")
    print("\nProcessing complete.")

    return summary_data


async def main(concurrency_limit: int):
    """Evaluate every record in INPUT_FILE concurrently; write results to OUTPUT_FILE.

    Each output line is the input record plus its four sub-evaluations and the
    derived error-type label. A summary table is printed at the end. At most
    *concurrency_limit* judge API calls run at once (via process_line's semaphore).
    """
    semaphore = asyncio.Semaphore(concurrency_limit)
    error_counter = Counter()

    print(f"Starting evaluation with a concurrency limit of {concurrency_limit}...")

    async with aiohttp.ClientSession() as session:
        try:
            with open(INPUT_FILE, "r", encoding="utf-8") as f:
                # Skip blank lines so a trailing newline doesn't surface as a
                # spurious JSON-decode error during processing.
                lines = [line for line in f if line.strip()]
        except FileNotFoundError:
            print(f"Error: Input file not found at '{INPUT_FILE}'")
            return

        tasks = [process_line(line, session, semaphore) for line in lines]

        with open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:
            pbar = tqdm(
                asyncio.as_completed(tasks),
                total=len(tasks),
                desc="Processing samples concurrently",
                unit="sample",
            )
            for future in pbar:
                try:
                    result = await future
                    outfile.write(json.dumps(result, ensure_ascii=False) + "\n")

                    error_type = result["error_type"]
                    error_counter[error_type] += 1
                    pbar.set_postfix({"Latest Type": error_type})

                except Exception as e:
                    # One bad sample must not abort the whole run.
                    print(f"An error occurred while processing a line: {e}")

    # log_summary's return value is informational; nothing downstream uses it,
    # so the previously-unused binding is dropped.
    log_summary(error_counter, len(lines))


if __name__ == "__main__":
    # CLI entry point: the only knob is how many judge API calls may be in
    # flight at once (input/output paths are module-level constants).
    parser = argparse.ArgumentParser(description="Evaluate assistant responses from a JSONL file.")
    parser.add_argument(
        "--concurrency-limit",
        type=int,
        default=10,
        help="The maximum number of concurrent API calls.",
    )
    args = parser.parse_args()

    asyncio.run(main(concurrency_limit=args.concurrency_limit))
Loading
Loading