Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rfc: @ls.pytest.mark.parametrize interface #1199

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion python/docs/create_api_rst.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Script for auto-generating api_reference.rst."""

Check notice on line 1 in python/docs/create_api_rst.py

View workflow job for this annotation

GitHub Actions / benchmark

Benchmark results

......................................... create_5_000_run_trees: Mean +- std dev: 618 ms +- 42 ms ......................................... create_10_000_run_trees: Mean +- std dev: 1.20 sec +- 0.06 sec ......................................... create_20_000_run_trees: Mean +- std dev: 1.19 sec +- 0.05 sec ......................................... dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 707 us +- 8 us ......................................... dumps_class_nested_py_leaf_50x100: Mean +- std dev: 25.1 ms +- 0.2 ms ......................................... dumps_class_nested_py_leaf_100x200: Mean +- std dev: 104 ms +- 2 ms ......................................... dumps_dataclass_nested_50x100: Mean +- std dev: 25.4 ms +- 0.3 ms ......................................... WARNING: the benchmark result may be unstable * the standard deviation (15.2 ms) is 23% of the mean (65.3 ms) Try to rerun the benchmark with more runs, values and/or loops. Run 'python -m pyperf system tune' command to reduce the system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results. Use --quiet option to hide these warnings. dumps_pydantic_nested_50x100: Mean +- std dev: 65.3 ms +- 15.2 ms ......................................... WARNING: the benchmark result may be unstable * the standard deviation (29.5 ms) is 14% of the mean (218 ms) Try to rerun the benchmark with more runs, values and/or loops. Run 'python -m pyperf system tune' command to reduce the system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results. Use --quiet option to hide these warnings. dumps_pydanticv1_nested_50x100: Mean +- std dev: 218 ms +- 29 ms

Check notice on line 1 in python/docs/create_api_rst.py

View workflow job for this annotation

GitHub Actions / benchmark

Comparison against main

+-----------------------------------------------+--------+----------------------+ | Benchmark | main | changes | +===============================================+========+======================+ | dumps_class_nested_py_branch_and_leaf_200x400 | 711 us | 707 us: 1.01x faster | +-----------------------------------------------+--------+----------------------+ | Geometric mean | (ref) | 1.00x faster | +-----------------------------------------------+--------+----------------------+ Benchmark hidden because not significant (8): dumps_pydantic_nested_50x100, dumps_pydanticv1_nested_50x100, create_5_000_run_trees, create_20_000_run_trees, dumps_class_nested_py_leaf_50x100, create_10_000_run_trees, dumps_class_nested_py_leaf_100x200, dumps_dataclass_nested_50x100

import importlib
import inspect
Expand Down Expand Up @@ -105,7 +105,9 @@
else (
"enum"
if issubclass(type_, Enum)
else "Pydantic" if issubclass(type_, BaseModel) else "Regular"
else "Pydantic"
if issubclass(type_, BaseModel)
else "Regular"
)
)
if hasattr(type_, "__slots__"):
Expand Down
Empty file.
93 changes: 93 additions & 0 deletions python/langsmith/pytest/mark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from __future__ import annotations

import inspect
from typing import Any, Callable, Optional

import pytest

from langsmith import evaluate
from langsmith.evaluation._runner import TARGET_T


def parametrize(
dataset_name: str,
target_fn: TARGET_T,
*,
client: Optional[Any] = None,
max_concurrency: Optional[int] = None,
) -> Callable:
"""Decorator to parametrize a test function with LangSmith dataset examples.

Args:
dataset_name: Name of the LangSmith dataset to use
target_fn: Function to test that takes inputs dict and returns outputs dict
client: Optional LangSmith client to use
max_concurrency: Optional max number of concurrent evaluations

Returns:
Decorated test function that will be parametrized with dataset examples.
"""

def decorator(test_fn: Callable) -> Callable:
# Verify test function signature
sig = inspect.signature(test_fn)
required_params = {"inputs", "outputs", "reference_outputs"}
if not all(param in sig.parameters for param in required_params):
raise ValueError(f"Test function must accept parameters: {required_params}")

def evaluator(run, example):
    """Run the test function on one example and report a "pass" score.

    The test function is called with the example's inputs, the run's
    outputs, and the example's outputs as ``reference_outputs``.

    Args:
        run: Object exposing the target's result as ``run.outputs``.
        example: Object exposing ``example.inputs`` and ``example.outputs``.

    Returns:
        * If the test function raises: a single failing result with key
          ``"pass"``, score ``0.0`` and the error text as ``comment``.
        * If it returns nothing (or any falsy value): a single passing
          result with key ``"pass"`` and score ``1.0``.
        * Otherwise: a ``{"results": [...]}`` mapping holding the feedback
          returned by the test function followed by the passing result.
          The test function may return one feedback dict, a sequence of
          feedback dicts, or a mapping that already has a ``"results"`` key.
    """
    passed = {"score": 1.0, "key": "pass"}
    try:
        results = test_fn(
            inputs=example.inputs,
            outputs=run.outputs,
            reference_outputs=example.outputs,
        )
    except AssertionError as e:
        return {"score": 0.0, "key": "pass", "comment": str(e)}
    except Exception as e:
        # Errors are recorded as feedback instead of propagating, so one bad
        # example does not abort the evaluation of the others.
        return {
            "score": 0.0,
            "key": "pass",
            "comment": f"Unexpected error: {e}",
        }

    if not results:
        return passed

    if isinstance(results, dict) and "results" in results:
        # Already in batch form: keep any extra keys, copy so the caller's
        # object is left untouched.
        batch = dict(results)
        feedback = results["results"]
    else:
        batch = {}
        # A lone feedback dict must be wrapped in a list; wrapping it as
        # {"results": <dict>} and appending to it would fail.
        feedback = [results] if isinstance(results, dict) else results

    # Build a fresh list rather than appending in place: the test function
    # may return the same list object for every example.
    batch["results"] = [*feedback, passed]
    return batch

@pytest.mark.parametrize(
"example_result",
evaluate(
target_fn,
data=dataset_name,
evaluators=[evaluator],
client=client,
max_concurrency=max_concurrency,
experiment_prefix=f"pytest_{test_fn.__name__}",
blocking=False,
),
)
# @functools.wraps(test_fn)
def wrapped(example_result):
"""Wrapped test function that gets parametrized with results."""
# Fail the test if the evaluation failed
eval_results = example_result["evaluation_results"]["results"]
if not eval_results:
pytest.fail("No evaluation results")

pass_result = [r for r in eval_results if r.key == "pass"][0]
if not pass_result.score:
error = pass_result.comment
pytest.fail(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would you set failure conditions? I assume people don't want to actually fail if any evaluation fails?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which might mean making the failure condition customizable in this interface.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This only fails if the actual test raises an error (we need to add a manual `pytest.fail` for that because we catch and log all errors in the wrapper, L48), so it is customizable by default.

f"Test failed for example {example_result['example'].id}: {error}"
)

return wrapped

return decorator
7 changes: 7 additions & 0 deletions python/tests/unit_tests/test_pytest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import langsmith as ls


@ls.pytest.mark.parametrize("Sample Dataset 3", lambda x: x)
def test_parametrize(inputs, outputs, reference_outputs) -> list:
    """Check that an identity target echoes each example's inputs.

    Also returns one extra feedback entry alongside the assertion.
    """
    assert inputs == outputs
    return [{"key": "foo", "value": "bar"}]
Loading