Adds example pytest tests for actions

skrawcz · skrawcz · commit a7927595d224 · 2024-12-03T23:20:58.000-08:00
This shows how to use pytest to test an action.

TODO:
 - show how to use the Burr fixture
 - show how to test an agent and use the tracker
diff --git a/examples/pytest/conftest.py b/examples/pytest/conftest.py
@@ -1,2 +1,25 @@
-# examples/pytest/conftest.py
-# TODO: utility functions for pytest fixtures
+import pytest
+
+
+class ResultCollector:
+    """Example of a custom fixture that collects results from tests."""
+
+    def __init__(self):
+        self.results = []
+
+    def append(self, result):
+        self.results.append(result)
+
+    def values(self):
+        return self.results
+
+    def __str__(self):
+        return "\n".join(str(result) for result in self.results)
+
+
+@pytest.fixture(scope="session")
+def result_collector():
+    """Fixture that collects results from tests. This is a toy example."""
+    collector = ResultCollector()
+    yield collector
+    print("\nCollected Results:\n", collector)
diff --git a/examples/pytest/diagnosis.png b/examples/pytest/diagnosis.png
diff --git a/examples/pytest/some_actions.py b/examples/pytest/some_actions.py
@@ -8,22 +8,6 @@
 from burr.core import Action, ApplicationContext, GraphBuilder, State, action
 from burr.core.parallelism import MapStates, RunnableGraph
 
-# @action(reads=["input"], writes=["response"])
-# def some_assistant_action(state: State, client: openai.Client) -> State:
-#     # get the input from the state
-#     input = state.get("input")
-#     # call the LLM
-#     response = client.chat.completions.create(
-#         messages=[
-#             {"role": "system", "content": "You are a helpful assistant."},
-#             {"role": "user", "content": input},
-#         ],
-#         model="gpt-4o-mini",
-#     )
-#     # update the state with the response
-#     return state.update(response=response.choices[0].message)
-#
-
 
 @action(reads=["audio"], writes=["transcription"])
 def transcribe_audio(state: State) -> State:
@@ -106,7 +90,9 @@ def determine_diagnosis(state: State) -> State:
         return state.update(final_diagnosis="Healthy individual")
 
 
-def run_my_agent(input_audio: str) -> Tuple[str, str]:
+def run_my_agent(
+    input_audio: str, partition_key: str = None, app_id: str = None, tracking_project: str = None
+) -> Tuple[str, str]:
     # we fake the input audio to be a string here rather than a waveform.
     graph = (
         GraphBuilder()
@@ -121,13 +107,17 @@ def run_my_agent(input_audio: str) -> Tuple[str, str]:
         )
         .build()
     )
-    app = (
+    app_builder = (
         core.ApplicationBuilder()
         .with_graph(graph)
         .with_state(**{"audio": input_audio})
         .with_entrypoint("transcribe_audio")
-        .build()
+        .with_identifiers(partition_key=partition_key, app_id=app_id)
     )
+    if tracking_project:
+        app_builder = app_builder.with_tracker(project=tracking_project)
+    app = app_builder.build()
+    # app.visualize("diagnosis.png", include_conditions=True, view=False, format="png")
     last_action, _, agent_state = app.run(
         halt_after=["determine_diagnosis"],
         inputs={"audio": input_audio},
diff --git a/examples/pytest/test_some_actions.py b/examples/pytest/test_some_actions.py
@@ -1,36 +1,107 @@
+"""This module shows example tests for testing actions and agents."""
 import pytest
 
+from burr.core import state
 
-# examples/pytest/test_example.py
-def test_example(result_collector):
+from examples.pytest import some_actions
+
+
+def test_example1(result_collector):
+    """Example test that uses a custom fixture."""
     result_collector.append("Test result 1")
     result_collector.append("Test result 2")
     assert True
 
 
-@pytest.mark.parametrize("sample_idx", range(3))
-def test_1(sample_idx, results_bag):
-    results_bag.input = "..."
-    results_bag.actual = "foo bar"
-    results_bag.expected = "foo bar baz"
-    results_bag.cosine = 0.8
-    results_bag.jaccard = 0.6
-    results_bag.llm = sample_idx
-
-
-def test_2(results_bag):
+def test_example2(results_bag):
+    """Example that uses pytest-harvest results_bag fixture."""
+    # the following become columns in the final results
     results_bag.input = "..."
     results_bag.actual = "foo"
     results_bag.expected = "foo bar baz"
     results_bag.cosine = 0.3
     results_bag.jaccard = 0.2
-    print("hi")
-    assert False
+    assert True
+
+
+def test_example3(module_results_df):
+    """Example that shows how to access the module_results_df fixture."""
+    # note pytest runs these tests in order - so in practice this
+    # would be placed at the end of the test file
+    print(module_results_df.columns)
+
+
+def test_run_hypothesis(results_bag):
+    """Tests the run_hypothesis action for a single case"""
+    input = "Patient has a limp and is unable to flex right ankle. Ankle is swollen."
+    hypothesis = "Common cold"
+    expected = "no"
+    results_bag.input = input
+    results_bag.expected = expected
+    results_bag.test_function = "test_run_hypothesis"
+    input_state = state.State({"hypothesis": hypothesis, "transcription": input})
+    end_state = some_actions.run_hypothesis(input_state)
+    results_bag.actual = end_state["diagnosis"]
+    results_bag.exact_match = end_state["diagnosis"].lower() == expected
+    # results_bag.jaccard = ... # other measures here
+    # e.g. LLM as judge if applicable
+    # place asserts at end
+    assert end_state["diagnosis"] is not None
+    assert end_state["diagnosis"] != ""
+
+
+@pytest.mark.parametrize(
+    "input,hypothesis,expected",
+    [
+        ("Patient exhibits mucus dripping from nostrils and coughing.", "Common cold", "yes"),
+        (
+            "Patient has a limp and is unable to flex right ankle. Ankle is swollen.",
+            "Sprained ankle",
+            "yes",
+        ),
+        (
+            "Patient fell off and landed on their right arm. Their right wrist is swollen, "
+            "they can still move their fingers, and there is only minor pain or discomfort when the wrist is moved or "
+            "touched.",
+            "Broken arm",
+            "no",
+        ),
+    ],
+    ids=["common_cold", "sprained_ankle", "broken_arm"],
+)
+def test_run_hypothesis_parameterized(input, hypothesis, expected, results_bag):
+    """Example showing how to parameterize this."""
+    results_bag.input = input
+    results_bag.expected = expected
+    results_bag.test_function = "test_run_hypothesis_parameterized"
+    input_state = state.State({"hypothesis": hypothesis, "transcription": input})
+    end_state = some_actions.run_hypothesis(input_state)
+    results_bag.actual = end_state["diagnosis"]
+    results_bag.exact_match = end_state["diagnosis"].lower() == expected
+    # results_bag.jaccard = ... # other measures here
+    # e.g. LLM as judge if applicable
+    # place asserts at end
+    assert end_state["diagnosis"] is not None
+    assert end_state["diagnosis"] != ""
+
+
+def test_run_hypothesis_burr_fixture(input, hypothesis, expected, results_bag):
+    """This example shows how to scale parameterized with a file of inputs and expected outputs."""
 
 
 def test_print_results(module_results_df):
     print(module_results_df.columns)
     print(module_results_df.head())
-    # save to CSV
-    # upload to google sheets
     # compute statistics
+    # this is where you could use pandas to compute statistics like accuracy, etc.
+    tests_of_interest = module_results_df[
+        module_results_df["test_function"].fillna("").str.startswith("test_run_hypothesis")
+    ]
+    accuracy = sum(tests_of_interest["exact_match"]) / len(tests_of_interest)
+    # save to CSV
+    tests_of_interest[
+        ["test_function", "duration_ms", "status", "input", "expected", "actual", "exact_match"]
+    ].to_csv("results.csv", index=True, quoting=1)
+    # upload to google sheets or other storage, etc.
+
+    assert accuracy > 0.9  # and then assert on the computed statistics