1 change: 1 addition & 0 deletions test/.gitignore
@@ -0,0 +1 @@
env
4,701 changes: 4,701 additions & 0 deletions test/DSPyVanillaJac.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions test/DSPyVanillaJacGSM8k-370.csv

Large diffs are not rendered by default.

1,151 changes: 1,151 additions & 0 deletions test/DSPyVanillaJacGSM8k.csv

Large diffs are not rendered by default.

4,337 changes: 4,337 additions & 0 deletions test/ModelSweep-12-06-2024-2303-2.csv

Large diffs are not rendered by default.

18,785 changes: 18,785 additions & 0 deletions test/ModelSweep-12-06-2024-2303-300usd.csv

Large diffs are not rendered by default.

3,137 changes: 3,137 additions & 0 deletions test/ModelSweep-12-06-2024-2303.csv

Large diffs are not rendered by default.

59,348 changes: 59,348 additions & 0 deletions test/ModelSweep-17-06-2024-0850.csv

Large diffs are not rendered by default.

6,129 changes: 6,129 additions & 0 deletions test/ModelSweep-19-06-2024-1820.csv

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions test/ModelSweep-21-06-2024-0103.csv

Large diffs are not rendered by default.

112 changes: 112 additions & 0 deletions test/main.py
@@ -0,0 +1,112 @@
from testkit import FactEvaluator
from datasets import load_dataset
import subprocess
import pandas as pd


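# Run `cmd` as a subprocess, piping `input` to its stdin via a temp file,
# and return the decoded stdout.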
def codeRun(cmd: list[str], input: str):
with open("/tmp/STDIN.txt", "w") as inputFile:
inputFile.write(input)
with open("/tmp/STDIN.txt", "r") as inputFile:
return subprocess.check_output(cmd, stdin=inputFile).decode()


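# Checkpoint helper: dump the accumulated per-question results to CSV.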
def save(res):
df = pd.DataFrame(
res,
columns=[
"Question",
"GivenAnswer",
"DSPyCOTResponse",
"JacRespose",
"DSPyCOTExactMatch",
"JacExactMatch",
"DSPyCOTFailed",
"JacFailed",
],
)
df.to_csv("DSPyVanillaJacGSM8k.csv")


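# Benchmark loop: for each GSM8K training question, run the DSPy and Jac
# implementations as subprocesses, score both against the gold answer with
# FactEvaluator, and checkpoint every 10 questions.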
ds = load_dataset("openai/gsm8k", "main", split="train")
train = ds.iter(batch_size=1)
res = []
for i in train:
question = i["question"][0]
answer_str: str = i["answer"][0]
dspyFailed = False
jacFailed = False
try:
dspyResponse = codeRun(
["python", "tested_code/gsm8k/dspy_vanilla_impl.py"], input=question
).strip()
    except Exception:  # any crash of the tested program counts as a failure
dspyFailed = True
dspyResponse = ""

try:
jacResponse = codeRun(
["jac", "run", "tested_code/gsm8k/jac_impl.jac"], input=question
).strip()
    except Exception:
jacFailed = True
jacResponse = ""

    # GSM8K gold answers end with "#### <number>"; keep only the number
    answer = answer_str.split(" ")[-1].replace(",", "")
emEval = FactEvaluator(answer=answer)
comparison = emEval.eval(dspyResponse, jacResponse)
individual_test = [
question,
answer,
dspyResponse,
jacResponse,
comparison.scoreA.overall(),
comparison.scoreB.overall(),
dspyFailed,
jacFailed,
]
print(individual_test)
res.append(individual_test)
if len(res) % 10 == 0:
save(res)


# ds = load_dataset("hotpotqa/hotpot_qa", "fullwiki", split="train")
# train = ds.iter(batch_size=1)
# res = []
# print(ds)
# for i in train:
# if i['level'][0] != 'hard':
# continue
# question = f"{i['context'][0]} {i['question'][0]}"
# answer = i["answer"][0]
# dspyResponse = codeRun(
# ["python", "tested_code/qa/dspy_vanilla_impl.py"], input=question
# ).strip()
# jacResponse = codeRun(
# ["jac", "run", "tested_code/qa/jac_impl.jac"], input=question
# ).strip()
# print(answer)
# print("DSPy Response", dspyResponse)
# print("JAC Response", jacResponse.strip())
# emEval = FactEvaluator(answer=answer)
# comparison = emEval.eval(dspyResponse, jacResponse)
# print(comparison.scoreA, comparison.scoreB)
# res.append(
# [
# question,
# answer,
# dspyResponse,
# jacResponse,
# comparison.scoreA.overall(),
# comparison.scoreB.overall(),
# ]
# )
# if len(res) % 10 == 0:
# save(res)
# break
109 changes: 109 additions & 0 deletions test/multiRetryTest.py
@@ -0,0 +1,109 @@
from datetime import datetime
import os
from datasets import load_dataset
import subprocess
import pandas as pd
import json

# models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o"]
models = ["gpt-4o"]


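# Same runner as in main.py, but injects MODEL_NAME into the subprocess
# environment and records wall-clock duration per call.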
def codeRun(cmd: list[str], input: str, modelName: str):
subEnv = os.environ.copy()
subEnv["MODEL_NAME"] = modelName
with open("/tmp/STDIN.txt", "w") as inputFile:
inputFile.write(input)
with open("/tmp/STDIN.txt", "r") as inputFile:
start = datetime.now()
res = subprocess.check_output(cmd, stdin=inputFile, env=subEnv).decode()
duration = datetime.now() - start
return res, duration.total_seconds()


def save(res):
df = pd.DataFrame(
res,
columns=[
"QuestionID",
"Question",
"GivenAnswer",
"Model",
"Program",
"Output",
"ExactMatch",
"Failed",
"Time(s)",
"PromptTokens",
"CompletionTokens",
"RawPrompt",
"RawResponse",
],
)
df.to_csv("ModelSweep-21-06-2024-0103.csv")


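# Sweep the first 300 GSM8K questions across `models`. Per-call token counts
# come from RawPrompt.json / RawResponse.json, side files the tested scripts
# are assumed to write (a list of OpenAI-style response objects per run).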
ds = load_dataset("openai/gsm8k", "main", split="train")
train = ds.iter(batch_size=1)
res = []
count = 0
for i in train:
    if count == 300:
exit(0)
question = i["question"][0]
answer_str: str = i["answer"][0]
answer = answer_str.split(" ")[-1].replace(",", "")
for model in models:
print(f"Running Question {count} with model {model}")
dspyFailed = False
jacFailed = False
dspyRawPrompt = ""
jacRawPrompt = ""
try:
dspyResponse, dspyTimer = codeRun(
["python", "tested_code/gsm8k/dspy_vanilla_impl.py"], input=question, modelName=model
)
dspyResponse = dspyResponse.strip()
except KeyboardInterrupt:
exit(1)
        except Exception:
dspyFailed = True
dspyResponse = ""
dspyTimer = 0
with open("RawPrompt.json", "r") as rawPromptFile, open("RawResponse.json", "r") as rawResponseFile:
dspyRawPrompt = rawPromptFile.read()
dspyRawResponse = json.load(rawResponseFile)
os.remove("RawPrompt.json")
os.remove("RawResponse.json")
dspyPromptTokens = sum([i["usage"]["prompt_tokens"] for i in dspyRawResponse])
dspyCompletionTokens = sum([i["usage"]["completion_tokens"] for i in dspyRawResponse])
dspyResult = [count, question, answer, model, "DSPy", dspyResponse, (dspyResponse == answer), dspyFailed, dspyTimer, dspyPromptTokens, dspyCompletionTokens, dspyRawPrompt, json.dumps(dspyRawResponse)]
print("DSPy Result", dspyResult)
res.append(dspyResult)

try:
jacResponse, jacTimer = codeRun(
["jac", "run", "tested_code/gsm8k/jac_impl.jac"], input=question, modelName=model
)
jacResponse = jacResponse.strip()
except KeyboardInterrupt:
exit(1)
        except Exception:
jacFailed = True
jacResponse = ""
jacTimer = 0
with open("RawPrompt.json", "r") as rawPromptFile, open("RawResponse.json", "r") as rawResponseFile:
jacRawPrompt = rawPromptFile.read()
jacRawResponse = json.load(rawResponseFile)
os.remove("RawPrompt.json")
os.remove("RawResponse.json")
jacPromptTokens = sum([i["usage"]["prompt_tokens"] for i in jacRawResponse])
jacCompletionTokens = sum([i["usage"]["completion_tokens"] for i in jacRawResponse])
jacResult = [count, question, answer, model, "Jac", jacResponse, (jacResponse == answer), jacFailed, jacTimer, jacPromptTokens, jacCompletionTokens, jacRawPrompt, json.dumps(jacRawResponse)]
print("Jac Result", jacResult)
res.append(jacResult)

save(res)
count += 1
105 changes: 105 additions & 0 deletions test/newTest.py
@@ -0,0 +1,105 @@
from datetime import datetime
import os
from datasets import load_dataset
import subprocess
import pandas as pd
import json

# models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o"]
models = ["gpt-4o"]


def codeRun(cmd: list[str], input: str, modelName: str):
subEnv = os.environ.copy()
subEnv["MODEL_NAME"] = modelName
with open("/tmp/STDIN.txt", "w") as inputFile:
inputFile.write(input)
with open("/tmp/STDIN.txt", "r") as inputFile:
start = datetime.now()
res = subprocess.check_output(cmd, stdin=inputFile, env=subEnv).decode()
duration = datetime.now() - start
return res, duration.total_seconds()


def save(res):
df = pd.DataFrame(
res,
columns=[
"QuestionID",
"Question",
"GivenAnswer",
"Model",
"Program",
"Output",
"ExactMatch",
"Failed",
"Time(s)",
"PromptTokens",
"CompletionTokens",
"RawPrompt",
"RawResponse",
],
)
df.to_csv("ModelSweep-19-06-2024-1820.csv")


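# DSPy-only variant of the sweep: each run is assumed to leave behind
# RawPrompt.txt and a single OpenAI-style RawResponse.json object; the Jac
# arm below is disabled.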
ds = load_dataset("openai/gsm8k", "main", split="train")
train = ds.iter(batch_size=1)
res = []
count = 0
for i in train:
    if count == 300:
exit(0)
question = i["question"][0]
answer_str: str = i["answer"][0]
answer = answer_str.split(" ")[-1].replace(",", "")
for model in models:
print(f"Running Question {count} with model {model}")
dspyFailed = False
jacFailed = False
dspyRawPrompt = ""
jacRawPrompt = ""
try:
dspyResponse, dspyTimer = codeRun(
["python", "tested_code/gsm8k/dspy_vanilla_impl.py"], input=question, modelName=model
)
dspyResponse = dspyResponse.strip()
except KeyboardInterrupt:
exit(1)
        except Exception:
dspyFailed = True
dspyResponse = ""
dspyTimer = 0
with open("RawPrompt.txt", "r") as rawPromptFile, open("RawResponse.json", "r") as rawResponseFile:
dspyRawPrompt = rawPromptFile.read()
dspyRawResponse = json.load(rawResponseFile)
os.remove("RawPrompt.txt")
os.remove("RawResponse.json")
dspyResult = [count, question, answer, model, "DSPy", dspyResponse, (dspyResponse == answer), dspyFailed, dspyTimer, dspyRawResponse["usage"]["prompt_tokens"], dspyRawResponse["usage"]["completion_tokens"], dspyRawPrompt, json.dumps(dspyRawResponse)]
print("DSPy Result", dspyResult)
res.append(dspyResult)

# try:
# jacResponse, jacTimer = codeRun(
# ["jac", "run", "tested_code/gsm8k/jac_impl.jac"], input=question, modelName=model
# )
# jacResponse = jacResponse.strip()
# except KeyboardInterrupt:
# exit(1)
# except:
# jacFailed = True
# jacResponse = ""
# jacTimer = 0
# with open("RawPrompt.txt", "r") as rawPromptFile, open("RawResponse.json", "r") as rawResponseFile:
# jacRawPrompt = rawPromptFile.read()
# jacRawResponse = json.load(rawResponseFile)
# os.remove("RawPrompt.txt")
# os.remove("RawResponse.json")
# jacResult = [count, question, answer, model, "Jac", jacResponse, (jacResponse == answer), jacFailed, jacTimer, jacRawResponse["usage"]["prompt_tokens"], jacRawResponse["usage"]["completion_tokens"], jacRawPrompt, json.dumps(jacRawResponse)]
# print("Jac Result", jacResult)
# res.append(jacResult)

save(res)
count += 1
10 changes: 10 additions & 0 deletions test/tested_code/gsm8k/dspy_vanilla_impl.py
@@ -0,0 +1,10 @@
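# Read one GSM8K question from stdin and answer it with DSPy's typed
# chain-of-thought module; MODEL_NAME selects the OpenAI model.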
import dspy
import os
model_name = os.environ["MODEL_NAME"]
llm = dspy.OpenAI(model=model_name, max_tokens=1000)
dspy.configure(lm=llm)
question = input()
answer = dspy.TypedChainOfThought('question:str -> answer:int')
response = answer(question=question)

print(response.answer)
11 changes: 11 additions & 0 deletions test/tested_code/gsm8k/jac_impl.jac
@@ -0,0 +1,11 @@
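# Jac counterpart of the DSPy solver: the `by llm` clause delegates
# answerquestion to the configured model with chain-of-thought prompting.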
import:py from os, environ;
import:py from jaclang.core.llms, Ollama, OpenAI;
glob model_name=environ["MODEL_NAME"];
glob llm = OpenAI(model_name=model_name);
can answerquestion(question: str) -> 'answer': int by llm(method="Chain-of-Thoughts");

with entry{
question = input();
answer = answerquestion(question);
print(answer);
}
8 changes: 8 additions & 0 deletions test/tested_code/qa/dspy_vanilla_impl.py
@@ -0,0 +1,8 @@
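# Free-form QA variant: untyped chain-of-thought against the default dspy.OpenAI model.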
import dspy
llm = dspy.OpenAI()
dspy.configure(lm=llm)
question = input()
answer = dspy.ChainOfThought('question -> answer')
response = answer(question=question)

print(response.answer)
9 changes: 9 additions & 0 deletions test/tested_code/qa/jac_impl.jac
@@ -0,0 +1,9 @@
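# Jac QA counterpart: a plain `by llm()` call that returns a string answer.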
import:py from jaclang.core.llms, Ollama, OpenAI;
glob llm = OpenAI();
can answerquestion(question: str) -> 'answer': str by llm();

with entry{
question = input();
answer = answerquestion(question);
print(answer);
}