diff --git a/.env.example b/.env.example
index afa0bb4..9e64db6 100644
--- a/.env.example
+++ b/.env.example
@@ -2,4 +2,5 @@ OPENAI_API_KEY=
 ANTHROPIC_API_KEY=
 google_api_key=
 google_engine_id=
-NEWS_API_KEY=
\ No newline at end of file
+NEWS_API_KEY=
+SERPER_API_KEY=
\ No newline at end of file
diff --git a/benchmark/benchmark/run_benchmark.py b/benchmark/benchmark/run_benchmark.py
index 349afa1..a666038 100755
--- a/benchmark/benchmark/run_benchmark.py
+++ b/benchmark/benchmark/run_benchmark.py
@@ -29,7 +29,6 @@
 from mech.packages.napthaai.customs.prediction_url_cot import (
     prediction_url_cot,
 )
-
 from mech.packages.napthaai.customs.prediction_request_rag_cohere import prediction_request_rag_cohere
 
 import time
@@ -48,13 +47,17 @@ def tool_map(tool):
     tool_dict = {
         "prediction-online": prediction_request,
         "prediction-offline": prediction_request,
+        "claude-prediction-offline": prediction_request,
         "prediction-online-summarized-info": prediction_request,
         "prediction-offline-sme": prediction_request_sme,
         "prediction-online-sme": prediction_request_sme,
         "prediction-request-rag": prediction_request_rag,
+        "prediction-request-rag-claude": prediction_request_rag,
         "prediction-request-rag-cohere": prediction_request_rag_cohere,
         "prediction-request-reasoning": prediction_request_reasoning,
+        "prediction-request-reasoning-claude": prediction_request_reasoning,
         "prediction-url-cot": prediction_url_cot,
+        "prediction-url-cot-claude": prediction_url_cot,
         "prediction-with-research-conservative": prediction_with_research_report,
         "prediction-with-research-bold": prediction_with_research_report,
     }
@@ -107,6 +110,8 @@
     else:
         test_q["p_no"] = None
 
+    test_q["confidence"] = result["confidence"]
+
     if response[3] is not None:
         test_q["input_tokens"] = response[3].cost_dict["input_tokens"]
         test_q["output_tokens"] = response[3].cost_dict["output_tokens"]
@@ -188,6 +193,7 @@
             "model",
             "p_yes",
             "p_no",
+            "confidence",
             "prediction",
             "Correct",
             "input_tokens",
@@ -291,29 +297,32 @@
 if __name__ == "__main__":
     kwargs = {}
-    kwargs["num_questions"] = 10
+    # kwargs["num_questions"] = 2
     kwargs["tools"] = [
-        # "prediction-online",
+        "prediction-online",
         # "prediction-offline",
+        # "claude-prediction-online",
+        # "claude-prediction-offline",
         # "prediction-online-summarized-info",
         # "prediction-offline-sme",
         # "prediction-online-sme",
-        #"prediction-request-rag",
-        "prediction-request-rag-cohere",
+        # "prediction-request-rag",
+        # "prediction-request-rag-claude",
+        # "prediction-request-rag-cohere",
         # "prediction-request-reasoning",
-        #"prediction-url-cot",
+        # "prediction-request-reasoning-claude",
+        # "prediction-url-cot",
+        # "prediction-url-cot-claude",
         # "prediction-with-research-conservative",
         # "prediction-with-research-bold",
     ]
-    # kwargs["llm_provider"] = "anthropic"
-    kwargs["llm_provider"] = "openrouter"
     kwargs["model"] = [
         # only supports running for one model (takes first in list)
         # "claude-3-haiku-20240307",
         # "claude-3-sonnet-20240229",
         # "claude-3-opus-20240229",
-        # "gpt-3.5-turbo-0125",
+        "gpt-3.5-turbo-0125",
         # "gpt-4-0125-preview",
-        "cohere/command-r-plus",
+        # "cohere/command-r-plus",
         # "databricks/dbrx-instruct:nitro"
         # "nousresearch/nous-hermes-2-mixtral-8x7b-sft"
     ]
diff --git a/benchmark/benchmark/run_marketclosing.py b/benchmark/benchmark/run_marketclosing.py
index f169d6d..01ca122 100755
--- a/benchmark/benchmark/run_marketclosing.py
+++ b/benchmark/benchmark/run_marketclosing.py
@@ -7,6 +7,7 @@
 from mech.packages.napthaai.customs.resolve_market_reasoning import (
     resolve_market_reasoning,
 )
+from mech.packages.kongzii.customs.ofv_market_resolver import ofv_market_resolver
 import os
 import openai
 import pandas as pd
@@ -31,6 +32,8 @@ def tool_map(tool):
         "resolve-market-reasoning-gpt-4",
     ]:
         return resolve_market_reasoning
+    elif tool in ["ofv-market-resolver"]:
+        return ofv_market_resolver
     else:
         raise Exception(f"Tool {tool} not found.")
 
@@ -70,16 +73,14 @@ def parse_response(response, test_q):
     else:
         test_q["prediction"] = None
 
-    test_q["reasoning"] = response[1].replace(os.linesep, "")
-    test_q["prompt_response"] = response[2].replace(os.linesep, "")
-    test_q["queries"] = response[3]
+    test_q["prompt_response"] = response[1].replace(os.linesep, "")
 
     if test_q["prediction"] in ("yes", "no"):
         test_q["Correct"] = test_q["prediction"] == test_q["answer"]
     else:
         test_q["Correct"] = None
 
-    if response[4] is not None:
-        test_q["input_tokens"] = response[4].cost_dict["input_tokens"]
-        test_q["output_tokens"] = response[4].cost_dict["output_tokens"]
-        test_q["total_tokens"] = response[4].cost_dict["total_tokens"]
+    if response[3] is not None:
+        test_q["input_tokens"] = response[3].cost_dict["input_tokens"]
+        test_q["output_tokens"] = response[3].cost_dict["output_tokens"]
+        test_q["total_tokens"] = response[3].cost_dict["total_tokens"]
@@ -249,8 +250,9 @@ def run_benchmark(kwargs):
     kwargs["num_questions"] = 5
     kwargs["tools"] = [
         # "resolve-market",
-        "resolve-market-reasoning-gpt-3.5-turbo",
+        # "resolve-market-reasoning-gpt-3.5-turbo",
         # "resolve-market-reasoning-gpt-4",
+        "ofv-market-resolver",
     ]
     kwargs["api_keys"] = {}
     kwargs["api_keys"]["openai"] = os.getenv("OPENAI_API_KEY")
@@ -258,6 +260,7 @@
     kwargs["api_keys"]["google_api_key"] = os.getenv("google_api_key")
     kwargs["api_keys"]["google_engine_id"] = os.getenv("google_engine_id")
     kwargs["api_keys"]["newsapi"] = os.getenv("NEWS_API_KEY")
+    kwargs["api_keys"]["serperapi"] = os.getenv("SERPER_API_KEY")
     kwargs["num_urls"] = 3
     kwargs["num_words"] = 300
     run_benchmark(kwargs)
diff --git a/benchmark/benchmark/scripts/write_summary.py b/benchmark/benchmark/scripts/write_summary.py
new file mode 100644
index 0000000..a053667
--- /dev/null
+++ b/benchmark/benchmark/scripts/write_summary.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+from benchmark.utils import get_logger, TokenCounterCallback
+import os
+import pandas as pd
+from pathlib import Path
+
+logger = get_logger(__name__)
+
+def write_results(csv_file_path):
+
+    results_path = Path(csv_file_path.parent)
+    time_string = csv_file_path.stem.split("_", 1)[-1]
+
+    results_df = pd.read_csv(csv_file_path)
+    num_errors = results_df["error"].count()
+    logger.info(f"Num errors: {str(num_errors)}")
+    results_df = results_df.dropna(subset=["prediction"])
+    grouped_df = results_df.groupby(["tool", "model"]).agg(
+        {
+            "Correct": ["mean", "sum", "count"],
+            "crowd_correct": ["mean"],
+            "input_tokens": ["mean"],
+            "output_tokens": ["mean"],
+            "total_tokens": ["mean"],
+            "input_cost": ["mean"],
+            "output_cost": ["mean"],
+            "total_cost": ["mean"],
+        }
+    )
+
+    grouped_df.columns = ["_".join(col).strip() for col in grouped_df.columns.values]
+    summary_df = grouped_df.reset_index().rename(
+        columns={
+            "Correct_mean": "accuracy",
+            "Correct_sum": "correct",
+            "Correct_count": "total",
+            "crowd_correct_mean": "crowd_accuracy",
+        }
+    )
+
+    logger.info(f"Results:\n\n {results_df}")
+    summary_df.to_csv(results_path / f"summary_{time_string}.csv", index=False)
+
+if __name__ == "__main__":
+    print(os.getcwd())
+    results_path = Path("results")
+
+    csv_file_path = results_path / f"results_240418124558.csv"
+
+    write_results(csv_file_path)
\ No newline at end of file
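
Below is a brief usage sketch for the new scripts/write_summary.py helper added above. It is illustrative only: the working directory, import path, and timestamp are assumptions, not part of the patch.

# Usage sketch: regenerate a summary CSV from an existing benchmark results file.
# Assumes the script's directory is on PYTHONPATH and the CSV sits under ./results.
from pathlib import Path

from write_summary import write_results  # hypothetical import path for the new script

# Writes results/summary_240418124558.csv with per-tool/model accuracy,
# crowd accuracy, and mean token/cost columns next to the input file.
write_results(Path("results") / "results_240418124558.csv")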