Chore/misc #10

Open
wants to merge 3 commits into base: main
3 changes: 2 additions & 1 deletion .env.example
@@ -2,4 +2,5 @@ OPENAI_API_KEY=
ANTHROPIC_API_KEY=
google_api_key=
google_engine_id=
NEWS_API_KEY=
NEWS_API_KEY=
SERPER_API_KEY=
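
For context on how these variables reach the scripts: both benchmark entry points read them with os.getenv, so the file is typically exported into the environment first. A minimal sketch, assuming the python-dotenv package and a copy of this file saved as .env:

# sketch: load the example environment file and read the key added here
# (python-dotenv is an assumption; the variable name mirrors .env.example)
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

serper_key = os.getenv("SERPER_API_KEY")
if serper_key is None:
    raise RuntimeError("SERPER_API_KEY is not set")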
29 changes: 19 additions & 10 deletions benchmark/benchmark/run_benchmark.py
@@ -29,7 +29,6 @@
from mech.packages.napthaai.customs.prediction_url_cot import (
    prediction_url_cot,
)

from mech.packages.napthaai.customs.prediction_request_rag_cohere import prediction_request_rag_cohere

import time
@@ -48,13 +47,17 @@ def tool_map(tool):
    tool_dict = {
        "prediction-online": prediction_request,
        "prediction-offline": prediction_request,
        "claude-prediction-offline": prediction_request,
        "prediction-online-summarized-info": prediction_request,
        "prediction-offline-sme": prediction_request_sme,
        "prediction-online-sme": prediction_request_sme,
        "prediction-request-rag": prediction_request_rag,
        "prediction-request-rag-claude": prediction_request_rag,
        "prediction-request-rag-cohere": prediction_request_rag_cohere,
        "prediction-request-reasoning": prediction_request_reasoning,
        "prediction-request-reasoning-claude": prediction_request_reasoning,
        "prediction-url-cot": prediction_url_cot,
        "prediction-url-cot-claude": prediction_url_cot,
        "prediction-with-research-conservative": prediction_with_research_report,
        "prediction-with-research-bold": prediction_with_research_report,
    }
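
The new claude-suffixed names are aliases: several keys resolve to the same implementation, so tool_map stays a plain dictionary lookup. A minimal sketch of that dispatch pattern (the stub function is a stand-in; the error message mirrors the one raised in run_marketclosing.py):

# sketch: dict-based dispatch where aliases share one callable
def prediction_request(question):
    ...  # stand-in for the real tool implementation

tool_dict = {
    "prediction-online": prediction_request,
    "claude-prediction-offline": prediction_request,  # alias added in this PR
}

def tool_map(tool):
    try:
        return tool_dict[tool]
    except KeyError:
        raise Exception(f"Tool {tool} not found.")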
@@ -107,6 +110,8 @@ def parse_response(response, test_q):
    else:
        test_q["p_no"] = None

    test_q["confidence"] = result["confidence"]

    if response[3] is not None:
        test_q["input_tokens"] = response[3].cost_dict["input_tokens"]
        test_q["output_tokens"] = response[3].cost_dict["output_tokens"]
@@ -188,6 +193,7 @@ def run_benchmark(kwargs):
"model",
"p_yes",
"p_no",
"confidence",
"prediction",
"Correct",
"input_tokens",
@@ -291,29 +297,32 @@ def run_benchmark(kwargs):

if __name__ == "__main__":
kwargs = {}
kwargs["num_questions"] = 10
# kwargs["num_questions"] = 2
kwargs["tools"] = [
# "prediction-online",
"prediction-online",
# "prediction-offline",
# "claude-prediction-online",
# "claude-prediction-offline",
# "prediction-online-summarized-info",
# "prediction-offline-sme",
# "prediction-online-sme",
#"prediction-request-rag",
"prediction-request-rag-cohere",
# "prediction-request-rag",
# "prediction-request-rag-claude",
# "prediction-request-rag-cohere",
# "prediction-request-reasoning",
#"prediction-url-cot",
# "prediction-request-reasoning-claude",
# "prediction-url-cot",
# "prediction-url-cot-claude",
# "prediction-with-research-conservative",
# "prediction-with-research-bold",
]
# kwargs["llm_provider"] = "anthropic"
kwargs["llm_provider"] = "openrouter"
kwargs["model"] = [ # only supports running for one model (takes first in list)
# "claude-3-haiku-20240307",
# "claude-3-sonnet-20240229",
# "claude-3-opus-20240229",
# "gpt-3.5-turbo-0125",
"gpt-3.5-turbo-0125",
# "gpt-4-0125-preview",
"cohere/command-r-plus",
# "cohere/command-r-plus",
# "databricks/dbrx-instruct:nitro"
# "nousresearch/nous-hermes-2-mixtral-8x7b-sft"
]
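
The __main__ block above is the intended entry point: toggle the commented tools and models, then run the module. A sketch of an equivalent programmatic call, assuming this kwargs shape and import path (both inferred from the file, not shown verbatim in the diff):

# sketch: invoking the benchmark with the same kwargs shape as above;
# the import path is an assumption based on the repo layout
from benchmark.run_benchmark import run_benchmark

kwargs = {
    "num_questions": 10,
    "tools": ["prediction-online"],
    "llm_provider": "openrouter",
    "model": ["gpt-3.5-turbo-0125"],  # only the first entry is used
}
run_benchmark(kwargs)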
13 changes: 8 additions & 5 deletions benchmark/benchmark/run_marketclosing.py
@@ -7,6 +7,7 @@
from mech.packages.napthaai.customs.resolve_market_reasoning import (
    resolve_market_reasoning,
)
from mech.packages.kongzii.customs.ofv_market_resolver import ofv_market_resolver
import os
import openai
import pandas as pd
@@ -31,6 +32,8 @@ def tool_map(tool):
"resolve-market-reasoning-gpt-4",
]:
return resolve_market_reasoning
elif tool in ["ofv-market-resolver"]:
return ofv_market_resolver
else:
raise Exception(f"Tool {tool} not found.")

@@ -70,16 +73,14 @@ def parse_response(response, test_q):
    else:
        test_q["prediction"] = None

    test_q["reasoning"] = response[1].replace(os.linesep, "")
    test_q["prompt_response"] = response[2].replace(os.linesep, "")
    test_q["queries"] = response[3]
    test_q["prompt_response"] = response[1].replace(os.linesep, "")

    if test_q["prediction"] in ("yes", "no"):
        test_q["Correct"] = test_q["prediction"] == test_q["answer"]
    else:
        test_q["Correct"] = None

    if response[3] is not None:
        test_q["input_tokens"] = response[3].cost_dict["input_tokens"]
        test_q["output_tokens"] = response[3].cost_dict["output_tokens"]
        test_q["total_tokens"] = response[3].cost_dict["total_tokens"]
@@ -249,15 +250,17 @@ def run_benchmark(kwargs):
kwargs["num_questions"] = 5
kwargs["tools"] = [
# "resolve-market",
"resolve-market-reasoning-gpt-3.5-turbo",
# "resolve-market-reasoning-gpt-3.5-turbo",
# "resolve-market-reasoning-gpt-4",
"ofv-market-resolver",
]
kwargs["api_keys"] = {}
kwargs["api_keys"]["openai"] = os.getenv("OPENAI_API_KEY")
kwargs["api_keys"]["anthropic"] = os.getenv("ANTHROPIC_API_KEY")
kwargs["api_keys"]["google_api_key"] = os.getenv("google_api_key")
kwargs["api_keys"]["google_engine_id"] = os.getenv("google_engine_id")
kwargs["api_keys"]["newsapi"] = os.getenv("NEWS_API_KEY")
kwargs["api_keys"]["serperapi"] = os.getenv("SERPER_API_KEY")
kwargs["num_urls"] = 3
kwargs["num_words"] = 300
run_benchmark(kwargs)
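
Both parse_response functions read token usage off a counter object carried in the response tuple, indexing a cost_dict whose keys also feed the summary script below. A minimal sketch of that assumed interface; only the key names are taken from the code, the class body is illustrative:

# sketch: the cost-accounting shape the parsers index into; the field
# names come from the scripts, the class body is a stand-in
class TokenCounterCallback:
    def __init__(self):
        self.cost_dict = {
            "input_tokens": 0,
            "output_tokens": 0,
            "total_tokens": 0,
            "input_cost": 0.0,
            "output_cost": 0.0,
            "total_cost": 0.0,
        }

counter = TokenCounterCallback()
counter.cost_dict["input_tokens"] += 512  # accumulated per model call
print(counter.cost_dict["input_tokens"])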
51 changes: 51 additions & 0 deletions benchmark/benchmark/scripts/write_summary.py
@@ -0,0 +1,51 @@
#!/usr/bin/env python

from benchmark.utils import get_logger, TokenCounterCallback
import os
import pandas as pd
from pathlib import Path

logger = get_logger(__name__)

def write_results(csv_file_path):

    results_path = Path(csv_file_path.parent)
    time_string = csv_file_path.stem.split("_", 1)[-1]

    results_df = pd.read_csv(csv_file_path)
    num_errors = results_df["error"].count()
    logger.info(f"Num errors: {str(num_errors)}")
    results_df = results_df.dropna(subset=["prediction"])
    grouped_df = results_df.groupby(["tool", "model"]).agg(
        {
            "Correct": ["mean", "sum", "count"],
            "crowd_correct": ["mean"],
            "input_tokens": ["mean"],
            "output_tokens": ["mean"],
            "total_tokens": ["mean"],
            "input_cost": ["mean"],
            "output_cost": ["mean"],
            "total_cost": ["mean"],
        }
    )

    grouped_df.columns = ["_".join(col).strip() for col in grouped_df.columns.values]
    summary_df = grouped_df.reset_index().rename(
        columns={
            "Correct_mean": "accuracy",
            "Correct_sum": "correct",
            "Correct_count": "total",
            "crowd_correct_mean": "crowd_accuracy",
        }
    )

    logger.info(f"Results:\n\n {results_df}")
    summary_df.to_csv(results_path / f"summary_{time_string}.csv", index=False)

if __name__ == "__main__":
    print(os.getcwd())
    results_path = Path("results")

    csv_file_path = results_path / "results_240418124558.csv"

    write_results(csv_file_path)
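
One step worth unpacking: groupby(...).agg(...) with list-valued aggregations yields a two-level column index, and the "_".join line flattens it into names like Correct_mean before the rename. A small self-contained illustration with invented data:

# sketch: flattening pandas MultiIndex aggregate columns, as in
# write_results above; the numbers are invented
import pandas as pd

df = pd.DataFrame(
    {"tool": ["a", "a", "b"], "model": ["m", "m", "m"], "Correct": [1, 0, 1]}
)
grouped = df.groupby(["tool", "model"]).agg({"Correct": ["mean", "sum", "count"]})
grouped.columns = ["_".join(col).strip() for col in grouped.columns.values]
print(grouped.reset_index())
#   tool model  Correct_mean  Correct_sum  Correct_count
# 0    a     m           0.5            1              2
# 1    b     m           1.0            1              1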