1 change: 1 addition & 0 deletions test/.gitignore
@@ -0,0 +1 @@
env
4,701 changes: 4,701 additions & 0 deletions test/DSPyVanillaJac.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions test/DSPyVanillaJacGSM8k-370.csv

Large diffs are not rendered by default.

1,151 changes: 1,151 additions & 0 deletions test/DSPyVanillaJacGSM8k.csv

Large diffs are not rendered by default.

4,337 changes: 4,337 additions & 0 deletions test/ModelSweep-12-06-2024-2303-2.csv

Large diffs are not rendered by default.

18,785 changes: 18,785 additions & 0 deletions test/ModelSweep-12-06-2024-2303-300usd.csv

Large diffs are not rendered by default.

3,137 changes: 3,137 additions & 0 deletions test/ModelSweep-12-06-2024-2303.csv

Large diffs are not rendered by default.

59,348 changes: 59,348 additions & 0 deletions test/ModelSweep-17-06-2024-0850.csv

Large diffs are not rendered by default.

6,129 changes: 6,129 additions & 0 deletions test/ModelSweep-19-06-2024-1820.csv

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions test/ModelSweep-21-06-2024-0103.csv

Large diffs are not rendered by default.

112 changes: 112 additions & 0 deletions test/main.py
@@ -0,0 +1,112 @@
from testkit import FactEvaluator
from datasets import load_dataset
import subprocess
import pandas as pd


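# Run `cmd` as a subprocess, piping `input` to its stdin via a temp file,
# and return the decoded stdout.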
def codeRun(cmd: list[str], input: str):
with open("/tmp/STDIN.txt", "w") as inputFile:
inputFile.write(input)
with open("/tmp/STDIN.txt", "r") as inputFile:
return subprocess.check_output(cmd, stdin=inputFile).decode()


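# Checkpoint helper: dump the accumulated per-question results to CSV.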
def save(res):
df = pd.DataFrame(
res,
columns=[
"Question",
"GivenAnswer",
"DSPyCOTResponse",
"JacRespose",
"DSPyCOTExactMatch",
"JacExactMatch",
"DSPyCOTFailed",
"JacFailed",
],
)
df.to_csv("DSPyVanillaJacGSM8k.csv")


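# Benchmark loop: for each GSM8K training question, run the DSPy and Jac
# implementations as subprocesses, score both against the gold answer with
# FactEvaluator, and checkpoint every 10 questions.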
ds = load_dataset("openai/gsm8k", "main", split="train")
train = ds.iter(batch_size=1)
res = []
for i in train:
question = i["question"][0]
answer_str: str = i["answer"][0]
dspyFailed = False
jacFailed = False
try:
dspyResponse = codeRun(
["python", "tested_code/gsm8k/dspy_vanilla_impl.py"], input=question
).strip()
    except Exception:  # any crash of the tested program counts as a failure
dspyFailed = True
dspyResponse = ""

try:
jacResponse = codeRun(
["jac", "run", "tested_code/gsm8k/jac_impl.jac"], input=question
).strip()
    except Exception:
jacFailed = True
jacResponse = ""

    # GSM8K gold answers end with "#### <number>"; keep only the number
    answer = answer_str.split(" ")[-1].replace(",", "")
emEval = FactEvaluator(answer=answer)
comparison = emEval.eval(dspyResponse, jacResponse)
individual_test = [
question,
answer,
dspyResponse,
jacResponse,
comparison.scoreA.overall(),
comparison.scoreB.overall(),
dspyFailed,
jacFailed,
]
print(individual_test)
res.append(individual_test)
if len(res) % 10 == 0:
save(res)


# ds = load_dataset("hotpotqa/hotpot_qa", "fullwiki", split="train")
# train = ds.iter(batch_size=1)
# res = []
# print(ds)
# for i in train:
# if i['level'][0] != 'hard':
# continue
# question = f"{i['context'][0]} {i['question'][0]}"
# answer = i["answer"][0]
# dspyResponse = codeRun(
# ["python", "tested_code/qa/dspy_vanilla_impl.py"], input=question
# ).strip()
# jacResponse = codeRun(
# ["jac", "run", "tested_code/qa/jac_impl.jac"], input=question
# ).strip()
# print(answer)
# print("DSPy Response", dspyResponse)
# print("JAC Response", jacResponse.strip())
# emEval = FactEvaluator(answer=answer)
# comparison = emEval.eval(dspyResponse, jacResponse)
# print(comparison.scoreA, comparison.scoreB)
# res.append(
# [
# question,
# answer,
# dspyResponse,
# jacResponse,
# comparison.scoreA.overall(),
# comparison.scoreB.overall(),
# ]
# )
# if len(res) % 10 == 0:
# save(res)
# break
109 changes: 109 additions & 0 deletions test/multiRetryTest.py
@@ -0,0 +1,109 @@
from datetime import datetime
import os
from datasets import load_dataset
import subprocess
import pandas as pd
import json

# models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o"]
models = ["gpt-4o"]


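# Same runner as in main.py, but injects MODEL_NAME into the subprocess
# environment and records wall-clock duration per call.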
def codeRun(cmd: list[str], input: str, modelName: str):
subEnv = os.environ.copy()
subEnv["MODEL_NAME"] = modelName
with open("/tmp/STDIN.txt", "w") as inputFile:
inputFile.write(input)
with open("/tmp/STDIN.txt", "r") as inputFile:
start = datetime.now()
res = subprocess.check_output(cmd, stdin=inputFile, env=subEnv).decode()
duration = datetime.now() - start
return res, duration.total_seconds()


def save(res):
df = pd.DataFrame(
res,
columns=[
"QuestionID",
"Question",
"GivenAnswer",
"Model",
"Program",
"Output",
"ExactMatch",
"Failed",
"Time(s)",
"PromptTokens",
"CompletionTokens",
"RawPrompt",
"RawResponse",
],
)
df.to_csv("ModelSweep-21-06-2024-0103.csv")


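# Sweep the first 300 GSM8K questions across `models`. Per-call token counts
# come from RawPrompt.json / RawResponse.json, side files the tested scripts
# are assumed to write (a list of OpenAI-style response objects per run).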
ds = load_dataset("openai/gsm8k", "main", split="train")
train = ds.iter(batch_size=1)
res = []
count = 0
for i in train:
    if count == 300:
exit(0)
question = i["question"][0]
answer_str: str = i["answer"][0]
answer = answer_str.split(" ")[-1].replace(",", "")
for model in models:
print(f"Running Question {count} with model {model}")
dspyFailed = False
jacFailed = False
dspyRawPrompt = ""
jacRawPrompt = ""
try:
dspyResponse, dspyTimer = codeRun(
["python", "tested_code/gsm8k/dspy_vanilla_impl.py"], input=question, modelName=model
)
dspyResponse = dspyResponse.strip()
except KeyboardInterrupt:
exit(1)
        except Exception:
dspyFailed = True
dspyResponse = ""
dspyTimer = 0
with open("RawPrompt.json", "r") as rawPromptFile, open("RawResponse.json", "r") as rawResponseFile:
dspyRawPrompt = rawPromptFile.read()
dspyRawResponse = json.load(rawResponseFile)
os.remove("RawPrompt.json")
os.remove("RawResponse.json")
dspyPromptTokens = sum([i["usage"]["prompt_tokens"] for i in dspyRawResponse])
dspyCompletionTokens = sum([i["usage"]["completion_tokens"] for i in dspyRawResponse])
dspyResult = [count, question, answer, model, "DSPy", dspyResponse, (dspyResponse == answer), dspyFailed, dspyTimer, dspyPromptTokens, dspyCompletionTokens, dspyRawPrompt, json.dumps(dspyRawResponse)]
print("DSPy Result", dspyResult)
res.append(dspyResult)

try:
jacResponse, jacTimer = codeRun(
["jac", "run", "tested_code/gsm8k/jac_impl.jac"], input=question, modelName=model
)
jacResponse = jacResponse.strip()
except KeyboardInterrupt:
exit(1)
        except Exception:
jacFailed = True
jacResponse = ""
jacTimer = 0
with open("RawPrompt.json", "r") as rawPromptFile, open("RawResponse.json", "r") as rawResponseFile:
jacRawPrompt = rawPromptFile.read()
jacRawResponse = json.load(rawResponseFile)
os.remove("RawPrompt.json")
os.remove("RawResponse.json")
jacPromptTokens = sum([i["usage"]["prompt_tokens"] for i in jacRawResponse])
jacCompletionTokens = sum([i["usage"]["completion_tokens"] for i in jacRawResponse])
jacResult = [count, question, answer, model, "Jac", jacResponse, (jacResponse == answer), jacFailed, jacTimer, jacPromptTokens, jacCompletionTokens, jacRawPrompt, json.dumps(jacRawResponse)]
print("Jac Result", jacResult)
res.append(jacResult)

save(res)
count += 1
105 changes: 105 additions & 0 deletions test/newTest.py
@@ -0,0 +1,105 @@
from datetime import datetime
import os
from datasets import load_dataset
import subprocess
import pandas as pd
import json

# models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o"]
models = ["gpt-4o"]


def codeRun(cmd: list[str], input: str, modelName: str):
subEnv = os.environ.copy()
subEnv["MODEL_NAME"] = modelName
with open("/tmp/STDIN.txt", "w") as inputFile:
inputFile.write(input)
with open("/tmp/STDIN.txt", "r") as inputFile:
start = datetime.now()
res = subprocess.check_output(cmd, stdin=inputFile, env=subEnv).decode()
duration = datetime.now() - start
return res, duration.total_seconds()


def save(res):
df = pd.DataFrame(
res,
columns=[
"QuestionID",
"Question",
"GivenAnswer",
"Model",
"Program",
"Output",
"ExactMatch",
"Failed",
"Time(s)",
"PromptTokens",
"CompletionTokens",
"RawPrompt",
"RawResponse",
],
)
df.to_csv("ModelSweep-19-06-2024-1820.csv")


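# DSPy-only variant of the sweep: each run is assumed to leave behind
# RawPrompt.txt and a single OpenAI-style RawResponse.json object; the Jac
# arm below is disabled.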
ds = load_dataset("openai/gsm8k", "main", split="train")
train = ds.iter(batch_size=1)
res = []
count = 0
for i in train:
    if count == 300:
exit(0)
question = i["question"][0]
answer_str: str = i["answer"][0]
answer = answer_str.split(" ")[-1].replace(",", "")
for model in models:
print(f"Running Question {count} with model {model}")
dspyFailed = False
jacFailed = False
dspyRawPrompt = ""
jacRawPrompt = ""
try:
dspyResponse, dspyTimer = codeRun(
["python", "tested_code/gsm8k/dspy_vanilla_impl.py"], input=question, modelName=model
)
dspyResponse = dspyResponse.strip()
except KeyboardInterrupt:
exit(1)
        except Exception:
dspyFailed = True
dspyResponse = ""
dspyTimer = 0
with open("RawPrompt.txt", "r") as rawPromptFile, open("RawResponse.json", "r") as rawResponseFile:
dspyRawPrompt = rawPromptFile.read()
dspyRawResponse = json.load(rawResponseFile)
os.remove("RawPrompt.txt")
os.remove("RawResponse.json")
dspyResult = [count, question, answer, model, "DSPy", dspyResponse, (dspyResponse == answer), dspyFailed, dspyTimer, dspyRawResponse["usage"]["prompt_tokens"], dspyRawResponse["usage"]["completion_tokens"], dspyRawPrompt, json.dumps(dspyRawResponse)]
print("DSPy Result", dspyResult)
res.append(dspyResult)

# try:
# jacResponse, jacTimer = codeRun(
# ["jac", "run", "tested_code/gsm8k/jac_impl.jac"], input=question, modelName=model
# )
# jacResponse = jacResponse.strip()
# except KeyboardInterrupt:
# exit(1)
# except:
# jacFailed = True
# jacResponse = ""
# jacTimer = 0
# with open("RawPrompt.txt", "r") as rawPromptFile, open("RawResponse.json", "r") as rawResponseFile:
# jacRawPrompt = rawPromptFile.read()
# jacRawResponse = json.load(rawResponseFile)
# os.remove("RawPrompt.txt")
# os.remove("RawResponse.json")
# jacResult = [count, question, answer, model, "Jac", jacResponse, (jacResponse == answer), jacFailed, jacTimer, jacRawResponse["usage"]["prompt_tokens"], jacRawResponse["usage"]["completion_tokens"], jacRawPrompt, json.dumps(jacRawResponse)]
# print("Jac Result", jacResult)
# res.append(jacResult)

save(res)
count += 1
10 changes: 10 additions & 0 deletions test/tested_code/gsm8k/dspy_vanilla_impl.py
@@ -0,0 +1,10 @@
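# Read one GSM8K question from stdin and answer it with DSPy's typed
# chain-of-thought module; MODEL_NAME selects the OpenAI model.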
import dspy
import os
model_name = os.environ["MODEL_NAME"]
llm = dspy.OpenAI(model=model_name, max_tokens=1000)
dspy.configure(lm=llm)
question = input()
answer = dspy.TypedChainOfThought('question:str -> answer:int')
response = answer(question=question)

print(response.answer)
11 changes: 11 additions & 0 deletions test/tested_code/gsm8k/jac_impl.jac
@@ -0,0 +1,11 @@
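# Jac counterpart of the DSPy solver: the `by llm` clause delegates
# answerquestion to the configured model with chain-of-thought prompting.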
import:py from os, environ;
import:py from jaclang.core.llms, Ollama, OpenAI;
glob model_name=environ["MODEL_NAME"];
glob llm = OpenAI(model_name=model_name);
can answerquestion(question: str) -> 'answer': int by llm(method="Chain-of-Thoughts");

with entry{
question = input();
answer = answerquestion(question);
print(answer);
}
8 changes: 8 additions & 0 deletions test/tested_code/qa/dspy_vanilla_impl.py
@@ -0,0 +1,8 @@
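# Free-form QA variant: untyped chain-of-thought against the default dspy.OpenAI model.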
import dspy
llm = dspy.OpenAI()
dspy.configure(lm=llm)
question = input()
answer = dspy.ChainOfThought('question -> answer')
response = answer(question=question)

print(response.answer)
9 changes: 9 additions & 0 deletions test/tested_code/qa/jac_impl.jac
@@ -0,0 +1,9 @@
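# Jac QA counterpart: a plain `by llm()` call that returns a string answer.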
import:py from jaclang.core.llms, Ollama, OpenAI;
glob llm = OpenAI();
can answerquestion(question: str) -> 'answer': str by llm();

with entry{
question = input();
answer = answerquestion(question);
print(answer);
}