diff --git a/evaluation/README.md b/evaluation/README.md index f0bd166e1..47cfeedc0 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -22,17 +22,32 @@ This repository provides tools and scripts for evaluating the LoCoMo dataset usi 2. Copy the `configs-example/` directory to a new directory named `configs/`, and modify the configuration files inside it as needed. This directory contains model and API-specific settings. ## Setup MemOS +### local server ```bash -#start server +# modify {project_dir}/.env file and start server uvicorn memos.api.server_api:app --host 0.0.0.0 --port 8001 --workers 8 -# modify .env file +# configure {project_dir}/evaluation/.env file MEMOS_URL="http://127.0.0.1:8001" ``` +### online service +```bash +# get your api key at https://memos-dashboard.openmem.net/cn/quickstart/ +# configure {project_dir}/evaluation/.env file +MEMOS_KEY="Token mpg-xxxxx" +MEMOS_ONLINE_URL="https://memos.memtensor.cn/api/openmem/v1" + +``` + +## Supported frameworks +We support `memos-api` and `memos-api-online` in our scripts. +We also provide unofficial implementations for the following memory frameworks: `zep`, `mem0`, `memobase`, `supermemory`, `memu`. 
+ + ## Evaluation Scripts ### LoCoMo Evaluation -⚙️ To evaluate the **LoCoMo** dataset using one of the supported memory frameworks — `memos`, `mem0`, or `zep` — run the following [script](./scripts/run_locomo_eval.sh): +⚙️ To evaluate the **LoCoMo** dataset using one of the supported memory frameworks, run the following [script](./scripts/run_locomo_eval.sh): ```bash # Edit the configuration in ./scripts/run_locomo_eval.sh @@ -53,7 +68,7 @@ First prepare the dataset `longmemeval_s` from https://huggingface.co/datasets/x ``` ### PrefEval Evaluation -To evaluate the **Prefeval** dataset using one of the supported memory frameworks — `memos`, `mem0`, or `zep` — run the following [script](./scripts/run_prefeval_eval.sh): +To evaluate the **PrefEval** dataset using one of the supported memory frameworks, run the following [script](./scripts/run_prefeval_eval.sh): ```bash # Edit the configuration in ./scripts/run_prefeval_eval.sh diff --git a/evaluation/scripts/locomo/locomo_eval.py b/evaluation/scripts/locomo/locomo_eval.py index f142fe130..b431e7768 100644 --- a/evaluation/scripts/locomo/locomo_eval.py +++ b/evaluation/scripts/locomo/locomo_eval.py @@ -363,7 +363,15 @@ async def limited_task(task): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "openai", "memos-api", "memobase"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/locomo/locomo_ingestion.py b/evaluation/scripts/locomo/locomo_ingestion.py index fe7aa86f7..518d90c4c 100644 --- a/evaluation/scripts/locomo/locomo_ingestion.py +++ b/evaluation/scripts/locomo/locomo_ingestion.py @@ -44,26 +44,33 @@ def ingest_session(client, session, frame, version, metadata): speaker_a_messages.append({"role": "assistant", "content": data}) speaker_b_messages.append({"role": "user", "content": data}) - if frame == "memos-api": + if "memos-api" in frame: 
for m in speaker_a_messages: m["chat_time"] = iso_date for m in speaker_b_messages: m["chat_time"] = iso_date - client.add(speaker_a_messages, speaker_a_user_id, f"{conv_id}_{metadata['session_key']}") - client.add(speaker_b_messages, speaker_b_user_id, f"{conv_id}_{metadata['session_key']}") + client.add( + speaker_a_messages, + speaker_a_user_id, + f"{conv_id}_{metadata['session_key']}", + batch_size=2, + ) + client.add( + speaker_b_messages, + speaker_b_user_id, + f"{conv_id}_{metadata['session_key']}", + batch_size=2, + ) elif "mem0" in frame: - for i in range(0, len(speaker_a_messages), 2): - batch_messages_a = speaker_a_messages[i : i + 2] - batch_messages_b = speaker_b_messages[i : i + 2] - client.add(batch_messages_a, speaker_a_user_id, timestamp) - client.add(batch_messages_b, speaker_b_user_id, timestamp) + client.add(speaker_a_messages, speaker_a_user_id, timestamp, batch_size=2) + client.add(speaker_b_messages, speaker_b_user_id, timestamp, batch_size=2) elif frame == "memobase": for m in speaker_a_messages: m["created_at"] = iso_date for m in speaker_b_messages: m["created_at"] = iso_date - client.add(speaker_a_messages, speaker_a_user_id) - client.add(speaker_b_messages, speaker_b_user_id) + client.add(speaker_a_messages, speaker_a_user_id, batch_size=2) + client.add(speaker_b_messages, speaker_b_user_id, batch_size=2) elif frame == "memu": client.add(speaker_a_messages, speaker_a_user_id, iso_date) client.add(speaker_b_messages, speaker_b_user_id, iso_date) @@ -103,6 +110,10 @@ def process_user(conv_idx, frame, locomo_df, version): from utils.client import MemosApiClient client = MemosApiClient() + elif frame == "memos-api-online": + from utils.client import MemosApiOnlineClient + + client = MemosApiOnlineClient() elif frame == "memobase": from utils.client import MemobaseClient @@ -187,7 +198,15 @@ def main(frame, version="default", num_workers=4): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "memos-api", "memobase", 
"memu", "supermemory"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/locomo/locomo_metric.py b/evaluation/scripts/locomo/locomo_metric.py index 6ddcdf127..e63888d45 100644 --- a/evaluation/scripts/locomo/locomo_metric.py +++ b/evaluation/scripts/locomo/locomo_metric.py @@ -9,7 +9,15 @@ parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "openai", "memos-api", "memobase"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/locomo/locomo_responses.py b/evaluation/scripts/locomo/locomo_responses.py index 35a444b7d..6c082b31d 100644 --- a/evaluation/scripts/locomo/locomo_responses.py +++ b/evaluation/scripts/locomo/locomo_responses.py @@ -134,7 +134,15 @@ async def main(frame, version="default"): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "openai", "memos-api", "memobase"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/locomo/locomo_search.py b/evaluation/scripts/locomo/locomo_search.py index c629124dd..1ddf0d933 100644 --- a/evaluation/scripts/locomo/locomo_search.py +++ b/evaluation/scripts/locomo/locomo_search.py @@ -198,7 +198,7 @@ def search_query(client, query, metadata, frame, version, top_k=20): context, duration_ms = mem0_graph_search( client, query, speaker_a_user_id, speaker_b_user_id, top_k, speaker_a, speaker_b ) - elif frame == "memos-api": + elif "memos-api" in frame: context, duration_ms = memos_api_search( client, query, speaker_a_user_id, speaker_b_user_id, top_k, speaker_a, speaker_b ) @@ -257,6 +257,10 @@ def process_user(conv_idx, locomo_df, frame, version, 
top_k=20, num_workers=1): from utils.client import MemosApiClient client = MemosApiClient() + elif frame == "memos-api-online": + from utils.client import MemosApiOnlineClient + + client = MemosApiOnlineClient() elif frame == "memobase": from utils.client import MemobaseClient @@ -336,7 +340,15 @@ def main(frame, version="default", num_workers=1, top_k=20): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "memos-api", "memobase", "memu", "supermemory"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/longmemeval/lme_eval.py b/evaluation/scripts/longmemeval/lme_eval.py index 73117b925..20681ac2c 100644 --- a/evaluation/scripts/longmemeval/lme_eval.py +++ b/evaluation/scripts/longmemeval/lme_eval.py @@ -344,7 +344,15 @@ async def main(frame, version, nlp_options, num_runs=3, num_workers=5): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "memos-api", "memobase", "memu", "supermemory"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( @@ -355,7 +363,7 @@ async def main(frame, version, nlp_options, num_runs=3, num_workers=5): type=str, nargs="+", default=["lexical"], - choices=["lexical", "semantic"], + choices=["lexical"], help="NLP options to use for evaluation.", ) parser.add_argument( diff --git a/evaluation/scripts/longmemeval/lme_ingestion.py b/evaluation/scripts/longmemeval/lme_ingestion.py index 325178292..e846a254c 100644 --- a/evaluation/scripts/longmemeval/lme_ingestion.py +++ b/evaluation/scripts/longmemeval/lme_ingestion.py @@ -18,7 +18,7 @@ def ingest_session(session, date, user_id, session_id, frame, client): if "mem0" in frame: for _idx, msg in enumerate(session): messages.append({"role": msg["role"], "content": msg["content"][:8000]}) - 
client.add(messages, user_id, int(date.timestamp())) + client.add(messages, user_id, int(date.timestamp()), batch_size=2) elif frame == "memobase": for _idx, msg in enumerate(session): messages.append( @@ -28,8 +28,8 @@ def ingest_session(session, date, user_id, session_id, frame, client): "created_at": date.isoformat(), } ) - client.add(messages, user_id) - elif frame == "memos-api": + client.add(messages, user_id, batch_size=2) + elif "memos-api" in frame: for msg in session: messages.append( { @@ -39,7 +39,7 @@ def ingest_session(session, date, user_id, session_id, frame, client): } ) if messages: - client.add(messages=messages, user_id=user_id, conv_id=session_id) + client.add(messages=messages, user_id=user_id, conv_id=session_id, batch_size=2) elif frame == "memu": for _idx, msg in enumerate(session): messages.append({"role": msg["role"], "content": msg["content"][:8000]}) @@ -80,6 +80,10 @@ def ingest_conv(lme_df, version, conv_idx, frame, success_records, f): from utils.client import MemosApiClient client = MemosApiClient() + elif frame == "memos-api-online": + from utils.client import MemosApiOnlineClient + + client = MemosApiOnlineClient() elif frame == "memobase": from utils.client import MemobaseClient @@ -167,7 +171,15 @@ def main(frame, version, num_workers=2): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "memos-api", "memobase", "memu", "supermemory"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/longmemeval/lme_metric.py b/evaluation/scripts/longmemeval/lme_metric.py index 93fa1de21..3664b47ba 100644 --- a/evaluation/scripts/longmemeval/lme_metric.py +++ b/evaluation/scripts/longmemeval/lme_metric.py @@ -258,7 +258,15 @@ def calculate_scores(data, grade_path, output_path): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "memos-api", "memobase", 
"memu", "supermemory"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/longmemeval/lme_responses.py b/evaluation/scripts/longmemeval/lme_responses.py index a4adf90b5..7d82358d6 100644 --- a/evaluation/scripts/longmemeval/lme_responses.py +++ b/evaluation/scripts/longmemeval/lme_responses.py @@ -132,7 +132,15 @@ def main(frame, version, num_workers=4): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "memos-api", "memobase", "memu", "supermemory"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/longmemeval/lme_search.py b/evaluation/scripts/longmemeval/lme_search.py index c02518083..60b2146f6 100644 --- a/evaluation/scripts/longmemeval/lme_search.py +++ b/evaluation/scripts/longmemeval/lme_search.py @@ -123,6 +123,11 @@ def process_user(lme_df, conv_idx, frame, version, top_k=20): client = MemosApiClient() context, duration_ms = memos_search(client, question, user_id, top_k) + elif frame == "memos-api-online": + from utils.client import MemosApiOnlineClient + + client = MemosApiOnlineClient() + context, duration_ms = memos_search(client, question, user_id, top_k) elif frame == "memu": from utils.client import MemuClient @@ -218,7 +223,15 @@ def main(frame, version, top_k=20, num_workers=2): parser.add_argument( "--lib", type=str, - choices=["mem0", "mem0_graph", "memos-api", "memobase", "memu", "supermemory"], + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], default="memos-api", ) parser.add_argument( diff --git a/evaluation/scripts/utils/client.py b/evaluation/scripts/utils/client.py index 91d695acc..4117cba56 100644 --- a/evaluation/scripts/utils/client.py +++ 
b/evaluation/scripts/utils/client.py @@ -82,21 +82,13 @@ def add(self, messages, user_id, timestamp, batch_size=2): raise e def search(self, query, user_id, top_k): - if self.enable_graph: - res = self.client.search( - query=query, - top_k=top_k, - user_id=user_id, - enable_graph=True, - filters={"AND": [{"user_id": f"{user_id}"}]}, - ) - else: - res = self.client.search( - query=query, - top_k=top_k, - user_id=user_id, - filters={"AND": [{"user_id": f"{user_id}"}]}, - ) + res = self.client.search( + query=query, + top_k=top_k, + user_id=user_id, + enable_graph=self.enable_graph, + filters={"AND": [{"user_id": f"{user_id}"}]}, + ) return res @@ -155,23 +147,29 @@ def __init__(self): self.memos_url = os.getenv("MEMOS_URL") self.headers = {"Content-Type": "application/json", "Authorization": os.getenv("MEMOS_KEY")} - def add(self, messages, user_id, conv_id): + def add(self, messages, user_id, conv_id, batch_size: int = 9999): """ messages = [{"role": "assistant", "content": data, "chat_time": date_str}] """ url = f"{self.memos_url}/product/add" - payload = json.dumps( - { - "messages": messages, - "user_id": user_id, - "mem_cube_id": user_id, - "conversation_id": conv_id, - } - ) - response = requests.request("POST", url, data=payload, headers=self.headers) - assert response.status_code == 200, response.text - assert json.loads(response.text)["message"] == "Memory added successfully", response.text - return response.text + added_memories = [] + for i in range(0, len(messages), batch_size): + batch_messages = messages[i : i + batch_size] + payload = json.dumps( + { + "messages": batch_messages, + "user_id": user_id, + "mem_cube_id": user_id, + "conversation_id": conv_id, + } + ) + response = requests.request("POST", url, data=payload, headers=self.headers) + assert response.status_code == 200, response.text + assert json.loads(response.text)["message"] == "Memory added successfully", ( + response.text + ) + added_memories += json.loads(response.text)["data"] + return 
added_memories def search(self, query, user_id, top_k): """Search memories.""" @@ -200,28 +198,30 @@ def __init__(self): self.memos_url = os.getenv("MEMOS_ONLINE_URL") self.headers = {"Content-Type": "application/json", "Authorization": os.getenv("MEMOS_KEY")} - def add(self, messages, user_id, conv_id=None): + def add(self, messages, user_id, conv_id=None, batch_size: int = 9999): url = f"{self.memos_url}/add/message" - payload = json.dumps( - { - "messages": messages, - "user_id": user_id, - "conversation_id": conv_id, - } - ) + for i in range(0, len(messages), batch_size): + batch_messages = messages[i : i + batch_size] + payload = json.dumps( + { + "messages": batch_messages, + "user_id": user_id, + "conversation_id": conv_id, + } + ) - max_retries = 5 - for attempt in range(max_retries): - try: - response = requests.request("POST", url, data=payload, headers=self.headers) - assert response.status_code == 200, response.text - assert json.loads(response.text)["message"] == "ok", response.text - return response.text - except Exception as e: - if attempt < max_retries - 1: - time.sleep(2**attempt) - else: - raise e + max_retries = 5 + for attempt in range(max_retries): + try: + response = requests.request("POST", url, data=payload, headers=self.headers) + assert response.status_code == 200, response.text + assert json.loads(response.text)["message"] == "ok", response.text + break + except Exception as e: + if attempt < max_retries - 1: + time.sleep(2**attempt) + else: + raise e def search(self, query, user_id, top_k): """Search memories.""" @@ -244,7 +244,7 @@ def search(self, query, user_id, top_k): res = json.loads(response.text)["data"]["memory_detail_list"] for i in res: i.update({"memory": i.pop("memory_value")}) - return {"text_mem": [{"memories": res}]} + return {"text_mem": [{"memories": res}], "pref_mem": ""} except Exception as e: if attempt < max_retries - 1: time.sleep(2**attempt)