Chore/misc #10

Open
wants to merge 3 commits into base: main
3 changes: 2 additions & 1 deletion .env.example
@@ -2,4 +2,5 @@ OPENAI_API_KEY=
ANTHROPIC_API_KEY=
google_api_key=
google_engine_id=
NEWS_API_KEY=
NEWS_API_KEY=
SERPER_API_KEY=
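
For context on how these variables reach the scripts: both benchmark entry points read them with os.getenv, so the file is typically exported into the environment first. A minimal sketch, assuming the python-dotenv package and a copy of this file saved as .env:

# sketch: load the example environment file and read the key added here
# (python-dotenv is an assumption; the variable name mirrors .env.example)
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

serper_key = os.getenv("SERPER_API_KEY")
if serper_key is None:
    raise RuntimeError("SERPER_API_KEY is not set")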
29 changes: 19 additions & 10 deletions benchmark/benchmark/run_benchmark.py
@@ -29,7 +29,6 @@
from mech.packages.napthaai.customs.prediction_url_cot import (
    prediction_url_cot,
)

from mech.packages.napthaai.customs.prediction_request_rag_cohere import prediction_request_rag_cohere

import time
@@ -48,13 +47,17 @@ def tool_map(tool):
    tool_dict = {
        "prediction-online": prediction_request,
        "prediction-offline": prediction_request,
        "claude-prediction-offline": prediction_request,
        "prediction-online-summarized-info": prediction_request,
        "prediction-offline-sme": prediction_request_sme,
        "prediction-online-sme": prediction_request_sme,
        "prediction-request-rag": prediction_request_rag,
        "prediction-request-rag-claude": prediction_request_rag,
        "prediction-request-rag-cohere": prediction_request_rag_cohere,
        "prediction-request-reasoning": prediction_request_reasoning,
        "prediction-request-reasoning-claude": prediction_request_reasoning,
        "prediction-url-cot": prediction_url_cot,
        "prediction-url-cot-claude": prediction_url_cot,
        "prediction-with-research-conservative": prediction_with_research_report,
        "prediction-with-research-bold": prediction_with_research_report,
    }
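
The new claude-suffixed names are aliases: several keys resolve to the same implementation, so tool_map stays a plain dictionary lookup. A minimal sketch of that dispatch pattern (the stub function is a stand-in; the error message mirrors the one raised in run_marketclosing.py):

# sketch: dict-based dispatch where aliases share one callable
def prediction_request(question):
    ...  # stand-in for the real tool implementation

tool_dict = {
    "prediction-online": prediction_request,
    "claude-prediction-offline": prediction_request,  # alias added in this PR
}

def tool_map(tool):
    try:
        return tool_dict[tool]
    except KeyError:
        raise Exception(f"Tool {tool} not found.")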
@@ -107,6 +110,8 @@ def parse_response(response, test_q):
    else:
        test_q["p_no"] = None

    test_q["confidence"] = result["confidence"]

    if response[3] is not None:
        test_q["input_tokens"] = response[3].cost_dict["input_tokens"]
        test_q["output_tokens"] = response[3].cost_dict["output_tokens"]
@@ -188,6 +193,7 @@ def run_benchmark(kwargs):
"model",
"p_yes",
"p_no",
"confidence",
"prediction",
"Correct",
"input_tokens",
@@ -291,29 +297,32 @@ def run_benchmark(kwargs):

if __name__ == "__main__":
kwargs = {}
kwargs["num_questions"] = 10
# kwargs["num_questions"] = 2
kwargs["tools"] = [
# "prediction-online",
"prediction-online",
# "prediction-offline",
# "claude-prediction-online",
# "claude-prediction-offline",
# "prediction-online-summarized-info",
# "prediction-offline-sme",
# "prediction-online-sme",
#"prediction-request-rag",
"prediction-request-rag-cohere",
# "prediction-request-rag",
# "prediction-request-rag-claude",
# "prediction-request-rag-cohere",
# "prediction-request-reasoning",
#"prediction-url-cot",
# "prediction-request-reasoning-claude",
# "prediction-url-cot",
# "prediction-url-cot-claude",
# "prediction-with-research-conservative",
# "prediction-with-research-bold",
]
# kwargs["llm_provider"] = "anthropic"
kwargs["llm_provider"] = "openrouter"
kwargs["model"] = [ # only supports running for one model (takes first in list)
# "claude-3-haiku-20240307",
# "claude-3-sonnet-20240229",
# "claude-3-opus-20240229",
# "gpt-3.5-turbo-0125",
"gpt-3.5-turbo-0125",
# "gpt-4-0125-preview",
"cohere/command-r-plus",
# "cohere/command-r-plus",
# "databricks/dbrx-instruct:nitro"
# "nousresearch/nous-hermes-2-mixtral-8x7b-sft"
]
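
The __main__ block above is the intended entry point: toggle the commented tools and models, then run the module. A sketch of an equivalent programmatic call, assuming this kwargs shape and import path (both inferred from the file, not shown verbatim in the diff):

# sketch: invoking the benchmark with the same kwargs shape as above;
# the import path is an assumption based on the repo layout
from benchmark.run_benchmark import run_benchmark

kwargs = {
    "num_questions": 10,
    "tools": ["prediction-online"],
    "llm_provider": "openrouter",
    "model": ["gpt-3.5-turbo-0125"],  # only the first entry is used
}
run_benchmark(kwargs)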
13 changes: 8 additions & 5 deletions benchmark/benchmark/run_marketclosing.py
@@ -7,6 +7,7 @@
from mech.packages.napthaai.customs.resolve_market_reasoning import (
    resolve_market_reasoning,
)
from mech.packages.kongzii.customs.ofv_market_resolver import ofv_market_resolver
import os
import openai
import pandas as pd
@@ -31,6 +32,8 @@ def tool_map(tool):
"resolve-market-reasoning-gpt-4",
]:
return resolve_market_reasoning
elif tool in ["ofv-market-resolver"]:
return ofv_market_resolver
else:
raise Exception(f"Tool {tool} not found.")

@@ -70,16 +73,14 @@ def parse_response(response, test_q):
    else:
        test_q["prediction"] = None

    test_q["reasoning"] = response[1].replace(os.linesep, "")
    test_q["prompt_response"] = response[2].replace(os.linesep, "")
    test_q["queries"] = response[3]
    test_q["prompt_response"] = response[1].replace(os.linesep, "")

    if test_q["prediction"] in ("yes", "no"):
        test_q["Correct"] = test_q["prediction"] == test_q["answer"]
    else:
        test_q["Correct"] = None

    if response[3] is not None:
        test_q["input_tokens"] = response[3].cost_dict["input_tokens"]
        test_q["output_tokens"] = response[3].cost_dict["output_tokens"]
        test_q["total_tokens"] = response[3].cost_dict["total_tokens"]
@@ -249,15 +250,17 @@ def run_benchmark(kwargs):
kwargs["num_questions"] = 5
kwargs["tools"] = [
# "resolve-market",
"resolve-market-reasoning-gpt-3.5-turbo",
# "resolve-market-reasoning-gpt-3.5-turbo",
# "resolve-market-reasoning-gpt-4",
"ofv-market-resolver",
]
kwargs["api_keys"] = {}
kwargs["api_keys"]["openai"] = os.getenv("OPENAI_API_KEY")
kwargs["api_keys"]["anthropic"] = os.getenv("ANTHROPIC_API_KEY")
kwargs["api_keys"]["google_api_key"] = os.getenv("google_api_key")
kwargs["api_keys"]["google_engine_id"] = os.getenv("google_engine_id")
kwargs["api_keys"]["newsapi"] = os.getenv("NEWS_API_KEY")
kwargs["api_keys"]["serperapi"] = os.getenv("SERPER_API_KEY")
kwargs["num_urls"] = 3
kwargs["num_words"] = 300
run_benchmark(kwargs)
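
Both parse_response functions read token usage off a counter object carried in the response tuple, indexing a cost_dict whose keys also feed the summary script below. A minimal sketch of that assumed interface; only the key names are taken from the code, the class body is illustrative:

# sketch: the cost-accounting shape the parsers index into; the field
# names come from the scripts, the class body is a stand-in
class TokenCounterCallback:
    def __init__(self):
        self.cost_dict = {
            "input_tokens": 0,
            "output_tokens": 0,
            "total_tokens": 0,
            "input_cost": 0.0,
            "output_cost": 0.0,
            "total_cost": 0.0,
        }

counter = TokenCounterCallback()
counter.cost_dict["input_tokens"] += 512  # accumulated per model call
print(counter.cost_dict["input_tokens"])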
51 changes: 51 additions & 0 deletions benchmark/benchmark/scripts/write_summary.py
@@ -0,0 +1,51 @@
#!/usr/bin/env python

from benchmark.utils import get_logger, TokenCounterCallback
import os
import pandas as pd
from pathlib import Path

logger = get_logger(__name__)

def write_results(csv_file_path):

    results_path = Path(csv_file_path.parent)
    time_string = csv_file_path.stem.split("_", 1)[-1]

    results_df = pd.read_csv(csv_file_path)
    num_errors = results_df["error"].count()
    logger.info(f"Num errors: {str(num_errors)}")
    results_df = results_df.dropna(subset=["prediction"])
    grouped_df = results_df.groupby(["tool", "model"]).agg(
        {
            "Correct": ["mean", "sum", "count"],
            "crowd_correct": ["mean"],
            "input_tokens": ["mean"],
            "output_tokens": ["mean"],
            "total_tokens": ["mean"],
            "input_cost": ["mean"],
            "output_cost": ["mean"],
            "total_cost": ["mean"],
        }
    )

    grouped_df.columns = ["_".join(col).strip() for col in grouped_df.columns.values]
    summary_df = grouped_df.reset_index().rename(
        columns={
            "Correct_mean": "accuracy",
            "Correct_sum": "correct",
            "Correct_count": "total",
            "crowd_correct_mean": "crowd_accuracy",
        }
    )

    logger.info(f"Results:\n\n {results_df}")
    summary_df.to_csv(results_path / f"summary_{time_string}.csv", index=False)

if __name__ == "__main__":
    print(os.getcwd())
    results_path = Path("results")

    csv_file_path = results_path / "results_240418124558.csv"

    write_results(csv_file_path)
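
One step worth unpacking: groupby(...).agg(...) with list-valued aggregations yields a two-level column index, and the "_".join line flattens it into names like Correct_mean before the rename. A small self-contained illustration with invented data:

# sketch: flattening pandas MultiIndex aggregate columns, as in
# write_results above; the numbers are invented
import pandas as pd

df = pd.DataFrame(
    {"tool": ["a", "a", "b"], "model": ["m", "m", "m"], "Correct": [1, 0, 1]}
)
grouped = df.groupby(["tool", "model"]).agg({"Correct": ["mean", "sum", "count"]})
grouped.columns = ["_".join(col).strip() for col in grouped.columns.values]
print(grouped.reset_index())
#   tool model  Correct_mean  Correct_sum  Correct_count
# 0    a     m           0.5            1              2
# 1    b     m           1.0            1              1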