Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions agent-eval-mlflow-otel/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
__pycache__/
*.pyc
.env
*.egg-info/
results/
output/
*.jsonl
!fixtures/*.yaml
venv/
.venv/
299 changes: 299 additions & 0 deletions agent-eval-mlflow-otel/README.md

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
106 changes: 106 additions & 0 deletions agent-eval-mlflow-otel/examples/check_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Compare recent trace scores against golden baselines.

Loads a JSONL golden baseline file, computes mean scores per scorer,
then compares recent traces. Flags regression if any scorer drops
more than THRESHOLD below the baseline mean.

Usage:
python3 check_regression.py --agent explore --strict
python3 check_regression.py --agent explore --days 14 --threshold 0.15
"""
import argparse
import json
import os
import sys

import mlflow

DEFAULT_THRESHOLD = 0.10


def connect():
    """Configure MLflow access from environment variables.

    Reads ``MLFLOW_TRACKING_URI`` and ``MLFLOW_OTLP_TOKEN``. A non-empty token
    is exported as the tracking password (with username ``admin``) unless
    those variables are already set. A non-empty URI is installed as the
    MLflow tracking URI.
    """
    env = os.environ
    tracking_uri = env.get("MLFLOW_TRACKING_URI", "")
    otlp_token = env.get("MLFLOW_OTLP_TOKEN", "")

    if otlp_token:
        # setdefault leaves credentials the caller already exported untouched.
        for key, fallback in (
            ("MLFLOW_TRACKING_USERNAME", "admin"),
            ("MLFLOW_TRACKING_PASSWORD", otlp_token),
        ):
            env.setdefault(key, fallback)

    if tracking_uri:
        mlflow.set_tracking_uri(tracking_uri)


def load_golden(agent: str) -> list[dict]:
    """Load golden baseline entries for *agent* from its JSONL file.

    The file is looked up at ``evals/baselines/{agent}-golden.jsonl`` relative
    to the current working directory and is read as UTF-8, one JSON document
    per line. Blank lines are skipped.

    Args:
        agent: Agent name used to build the baseline file name.

    Returns:
        The parsed entries in file order, or an empty list (after printing a
        notice) when the baseline file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file exists but cannot be opened (e.g. permissions).
    """
    path = f"evals/baselines/{agent}-golden.jsonl"
    try:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        handle = open(path, encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        print(f" No baseline found at {path}")
        return []
    with handle:
        return [json.loads(line) for line in handle if line.strip()]


def compute_means(entries: list[dict]) -> dict[str, float]:
    """Compute the mean score per scorer across golden entries.

    Each entry may carry a ``"scores"`` mapping of scorer name to value. Only
    ``int``/``float`` values contribute to a scorer's mean (``bool`` counts
    too, since it is an ``int`` subclass); other values are ignored. Entries
    whose ``"scores"`` is missing, ``None`` or not a mapping are skipped.

    NOTE(review): a reviewer of this change cautioned that baselines curated
    from production traces mix work items of differing complexity, so a drop
    relative to these means may reflect harder inputs or environmental noise
    rather than an agent regression. They suggested building the golden
    baselines from the fixed fixtures (fixtures/input.yaml,
    fixtures/rubric.yaml) instead.

    Args:
        entries: Golden baseline entries, e.g. as returned by ``load_golden``.

    Returns:
        Mapping of scorer name to the arithmetic mean of its numeric values.
        Scorers with no numeric values do not appear in the result.
    """
    # scorer name -> [running total, number of numeric values seen]
    tallies: dict[str, list] = {}
    for entry in entries:
        scores = entry.get("scores")
        if not isinstance(scores, dict):
            # Tolerate `"scores": null` (or a malformed value) in the JSONL.
            continue
        for scorer_name, value in scores.items():
            if isinstance(value, (int, float)):
                tally = tallies.setdefault(scorer_name, [0, 0])
                tally[0] += value
                tally[1] += 1
    return {name: total / count for name, (total, count) in tallies.items()}


def main():
    """CLI entry point: print the golden baseline means for one agent.

    Parses the command line, connects to MLflow, loads the agent's golden
    baseline and prints its per-scorer means. Fetching and scoring recent
    traces is not implemented in this example, so the regression list is
    always empty and the "no regression" message is printed.
    """
    cli = argparse.ArgumentParser(description="Check for quality regressions")
    cli.add_argument("--agent", required=True)
    cli.add_argument("--days", type=int, default=7)
    cli.add_argument("--max-traces", type=int, default=50)
    cli.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD)
    cli.add_argument("--strict", action="store_true", help="Exit 1 on any regression")
    opts = cli.parse_args()

    connect()
    mlflow.autolog(disable=True)

    baseline_entries = load_golden(opts.agent)
    if not baseline_entries:
        print(f" Skipping {opts.agent} — no baseline")
        return

    baseline_means = compute_means(baseline_entries)
    print(f" Golden baseline ({len(baseline_entries)} traces): {baseline_means}")

    # A production version would:
    #   1. fetch recent traces via mlflow.search_traces()
    #   2. score them with the same scorers used for the golden set
    #   3. compare the resulting means against the golden means
    # This example stops short of that and only reports the threshold.
    print(" To complete: fetch recent traces, score, compare against golden means")
    print(f" Regression threshold: {opts.threshold * 100:.0f}%")

    # Each item is (scorer_name, golden_mean, current_mean, relative_delta).
    found = []
    # Sketch of the comparison, given a `current_means` dict:
    # for scorer_name, golden_mean in baseline_means.items():
    #     current_mean = current_means.get(scorer_name, 0)
    #     delta = current_mean - golden_mean
    #     pct = delta / golden_mean if golden_mean > 0 else 0
    #     if pct < -opts.threshold:
    #         found.append((scorer_name, golden_mean, current_mean, pct))

    if not found:
        print("\n All scorers within threshold. No regression.")
        return

    print("\n !! REGRESSION detected:")
    for scorer, golden_mean, current_mean, rel_delta in found:
        print(
            f" {scorer}: golden={golden_mean:.3f}, "
            f"current={current_mean:.3f} ({rel_delta:+.1%})"
        )
    if opts.strict:
        sys.exit(1)


if __name__ == "__main__":
main()
47 changes: 47 additions & 0 deletions agent-eval-mlflow-otel/examples/harness-explore.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Example harness configuration for the explore agent.
# The eval section is the single source of truth for quality gates.
# harness.py reads this at runtime and resolves scorer names to Python functions.

agent: customized/agents/explore.md
model: opus
image: ghcr.io/fullsend-ai/fullsend-sandbox:latest
policy: customized/policies/explore.yaml

skills:
- customized/skills/public-research
- customized/skills/jira-read

pre_script: customized/scripts/pre-explore.sh

validation_loop:
script: scripts/validate-output-schema.sh
max_iterations: 2

post_script: customized/scripts/post-explore.sh

timeout_minutes: 20

eval:
scorers:
mechanical:
- validation_passed
- tool_efficiency
- cost_within_budget
- confidence_coherence
- iteration_count
llm_judge:
model: claude-opus-4-6
criteria:
- name: explore_context_quality
guidelines: >
Is the gathered context relevant, specific, and complete?
Did the agent look in the right places? Is context specific
enough for refinement? Were constraints/risks identified?
- name: reasoning_coherence
guidelines: >
Is reasoning logically coherent and evidence-based?
gates:
min_validation_rate: 0.80
# NOTE(review): 3.0 reads like a threshold on a 1-5 scale, but reviewer feedback on
# this config says the LLM-judge scorers emit scores normalized to 0-1. Confirm which
# scale this gate is compared against; a 0-1 score can never reach 3.0.
min_quality_score: 3.0
max_cost: 2.00

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Medium] The harness YAML defines min_quality_score: 3.0 (implying a 1–5 scale), but the LLM judge scorers normalize to 0–1 (result["score"] / 5.0). The README's observed ranges confirm the 0–1 scale (e.g., reasoning_coherence mean 0.71). Either the gate compares 3.0 against 0–1 values (nothing passes), or the gate uses raw scores before normalization (inconsistent with the scorer output). Aligning the scales — or documenting which scale the gate expects — would make the example self-consistent.

baseline: evals/baselines/explore-golden.jsonl
99 changes: 99 additions & 0 deletions agent-eval-mlflow-otel/examples/register_prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Register agent prompts in MLflow Prompts Registry.

Reads agent prompt markdown files and registers them as versioned prompts
with @staging or @production aliases. Uses content-hash dedup to skip
unchanged prompts while still updating the alias.

Usage:
python3 register_prompts.py --alias staging
python3 register_prompts.py --alias production
python3 register_prompts.py --alias staging --agents explore refine

Env:
GIT_COMMIT — Current git commit hash (for metadata)
GIT_BRANCH — Current git branch name
"""
import argparse
import hashlib
import os
from pathlib import Path

import mlflow
from mlflow import MlflowClient

AGENTS_DIR = Path(".fullsend/customized/agents")
PROMPT_PREFIX = "fullsend"


def connect():
    """Set up MLflow credentials and tracking URI from the environment.

    ``MLFLOW_OTLP_TOKEN``, when non-empty, is exported as
    ``MLFLOW_TRACKING_PASSWORD`` with ``MLFLOW_TRACKING_USERNAME`` defaulting
    to ``admin``; values already present in the environment are kept.
    ``MLFLOW_TRACKING_URI``, when non-empty, becomes the MLflow tracking URI.
    """
    tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "")
    auth_token = os.environ.get("MLFLOW_OTLP_TOKEN", "")

    credentials = (
        {
            "MLFLOW_TRACKING_USERNAME": "admin",
            "MLFLOW_TRACKING_PASSWORD": auth_token,
        }
        if auth_token
        else {}
    )
    for variable, value in credentials.items():
        # Do not override credentials that were exported explicitly.
        os.environ.setdefault(variable, value)

    if tracking_uri:
        mlflow.set_tracking_uri(tracking_uri)


def content_hash(text: str) -> str:
    """Return a short fingerprint of *text*.

    The fingerprint is the first 12 hexadecimal digits of the SHA-256 digest
    of the text's UTF-8 encoding.
    """
    digest = hashlib.sha256()
    digest.update(text.encode())
    return digest.hexdigest()[:12]


def register_prompt(agent: str, alias: str, client: MlflowClient):
    """Register one agent's prompt file in MLflow and point *alias* at it.

    The prompt is read from ``AGENTS_DIR / f"{agent}.md"`` and registered
    under the name ``f"{PROMPT_PREFIX}-{agent}"``. If the prompt version
    returned by the registry carries the same ``content.hash`` tag as the
    file, no new version is created and only the alias is moved.

    Args:
        agent: Agent name; selects the markdown file and the prompt name.
        alias: Alias to set on the resulting prompt version.
        client: MLflow client used to look up existing prompt versions.

    Returns:
        None. Progress is reported on stdout; a missing prompt file is
        reported and skipped rather than raised.
    """
    prompt_path = AGENTS_DIR / f"{agent}.md"
    if not prompt_path.exists():
        print(f" SKIP {agent} — {prompt_path} not found")
        return

    # Decode explicitly as UTF-8: content_hash() hashes the UTF-8 encoding, so
    # the hash and the registered template must not vary with the locale.
    content = prompt_path.read_text(encoding="utf-8")
    chash = content_hash(content)
    prompt_name = f"{PROMPT_PREFIX}-{agent}"

    # NOTE(review): with max_results=1 this treats the first returned version
    # as the latest one — confirm the ordering search_prompt_versions provides.
    existing = client.search_prompt_versions(name=prompt_name, max_results=1)
    if existing:
        latest = existing[0]
        latest_hash = (latest.tags or {}).get("content.hash", "")
        if latest_hash == chash:
            print(f" {prompt_name}: content unchanged (hash={chash}), updating alias only")
            mlflow.genai.set_prompt_alias(prompt_name, alias, latest.version)
            return

    # Provenance tags are only needed when a new version is created.
    tags = {
        "git.commit": os.environ.get("GIT_COMMIT", "unknown"),
        "git.branch": os.environ.get("GIT_BRANCH", "unknown"),
        "content.hash": chash,
        "agent": agent,
        "source": str(prompt_path),
    }

    version = mlflow.genai.register_prompt(
        name=prompt_name,
        template=content,
        commit_message=f"{alias}: {agent} prompt ({chash})",
        tags=tags,
    )
    print(f" {prompt_name}: registered v{version.version} (hash={chash})")

    mlflow.genai.set_prompt_alias(prompt_name, alias, version.version)
    print(f" {prompt_name}: alias @{alias} -> v{version.version}")


def main():
    """CLI entry point: register the selected agents' prompts under one alias.

    ``--alias`` (required) is ``staging`` or ``production``; ``--agents``
    defaults to explore, refine and critique.
    """
    cli = argparse.ArgumentParser(description="Register prompts in MLflow")
    cli.add_argument("--alias", required=True, choices=["staging", "production"])
    cli.add_argument("--agents", nargs="+", default=["explore", "refine", "critique"])
    opts = cli.parse_args()

    connect()
    registry = MlflowClient()

    # One client is shared across all agents.
    for agent_name in opts.agents:
        register_prompt(agent_name, opts.alias, registry)


if __name__ == "__main__":
main()
108 changes: 108 additions & 0 deletions agent-eval-mlflow-otel/examples/run_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Score traces via mlflow.genai.evaluate() and log operational metrics.

Reads traces from MLflow, resolves scorers from harness config, and runs
evaluation. Results appear as Feedbacks on traces (Quality Dashboard) and
as metrics on the evaluation run (Evaluation Runs page).

Usage:
python3 run_eval.py --agent explore --days 7 --max-traces 10
python3 run_eval.py --agent explore --mechanical-only
"""
import argparse
import os
import time

import mlflow
from mlflow import MlflowClient


def connect():
    """Set up the MLflow tracking connection from environment variables.

    A non-empty ``MLFLOW_OTLP_TOKEN`` supplies the tracking password (and the
    username ``admin``) unless those variables are already set. A non-empty
    ``MLFLOW_TRACKING_URI`` is installed as the tracking URI.
    """
    token = os.environ.get("MLFLOW_OTLP_TOKEN", "")
    if token:
        # Existing credentials in the environment take precedence.
        os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "admin")
        os.environ.setdefault("MLFLOW_TRACKING_PASSWORD", token)

    server_url = os.environ.get("MLFLOW_TRACKING_URI", "")
    if not server_url:
        return
    mlflow.set_tracking_uri(server_url)


def get_traces(agent=None, days=7, max_results=50):
    """Search MLflow for traces, optionally narrowed by agent tag and age.

    Args:
        agent: If truthy, only traces tagged ``fullsend.agent = agent``.
        days: If truthy, only traces newer than this many days before now
            (UTC); the cutoff is sent as epoch milliseconds.
        max_results: Upper bound on the number of traces requested.

    Returns:
        Whatever ``mlflow.search_traces`` returns for location ``"0"``.
    """
    clauses = []
    if agent:
        clauses.append(f"tags.`fullsend.agent` = '{agent}'")
    if days:
        from datetime import datetime, timedelta, timezone

        oldest = datetime.now(timezone.utc) - timedelta(days=days)
        oldest_ms = int(oldest.timestamp() * 1000)
        clauses.append(f"timestamp > {oldest_ms}")

    # With no clauses the join is "", which becomes None (no filter at all).
    return mlflow.search_traces(
        locations=["0"],
        filter_string=" AND ".join(clauses) or None,
        max_results=max_results,
    )


def resolve_scorers(agent, mechanical_only=False):
    """Return the scorers to run for *agent*.

    The mechanical scorers always apply. Unless ``mechanical_only`` is set,
    the explore, refine and critique agents also get their LLM-judge scorers
    appended; any other agent gets the mechanical scorers alone.

    In production this would be driven by the harness YAML; here the scorer
    lists are imported directly.
    """
    from scorer_mechanical import MECHANICAL_SCORERS

    # Fallback first: no judges requested, or no judges defined for the agent.
    if mechanical_only or agent not in ("explore", "refine", "critique"):
        return MECHANICAL_SCORERS

    # Import lazily so only the selected agent's judge list is looked up.
    if agent == "explore":
        from scorer_llm_judge import EXPLORE_SCORERS as judge_scorers
    elif agent == "refine":
        from scorer_llm_judge import REFINE_SCORERS as judge_scorers
    else:
        from scorer_llm_judge import CRITIQUE_SCORERS as judge_scorers

    return MECHANICAL_SCORERS + judge_scorers


def main():
    """CLI entry point: score recent traces for one agent and log run metrics.

    Fetches traces for ``--agent`` within the ``--days`` look-back window (at
    most ``--max-traces``), resolves the scorers (mechanical only when
    ``--mechanical-only`` is given) and runs ``mlflow.genai.evaluate``. The
    agent name, trace count and evaluation latency are then logged to the
    same MLflow run. Returns early, without creating a run, when no traces
    match.
    """
    parser = argparse.ArgumentParser(description="Score traces via MLflow")
    parser.add_argument("--agent", required=True, help="Agent name (explore, refine, critique)")
    parser.add_argument("--days", type=int, default=7, help="Look-back window in days")
    parser.add_argument("--max-traces", type=int, default=50, help="Max traces to score")
    parser.add_argument("--mechanical-only", action="store_true", help="Skip LLM judges")
    args = parser.parse_args()

    connect()
    mlflow.autolog(disable=True)

    print(f"Fetching traces for {args.agent} (last {args.days} days)...")
    traces_df = get_traces(agent=args.agent, days=args.days, max_results=args.max_traces)
    print(f" Found {len(traces_df)} traces")

    if traces_df.empty:
        print(" No traces to score.")
        return

    scorers = resolve_scorers(args.agent, args.mechanical_only)
    print(f" Running {len(scorers)} scorers...")

    # Open the run explicitly so the evaluation results and the operational
    # metrics below share one run that is closed on exit. Without an active
    # run, log_param()/log_metrics() would implicitly start a separate run.
    # NOTE(review): relies on mlflow.genai.evaluate() logging to the active
    # run — confirm against the installed MLflow version.
    with mlflow.start_run():
        # perf_counter is monotonic, so the latency is immune to clock changes.
        start = time.perf_counter()
        result = mlflow.genai.evaluate(data=traces_df, scorers=scorers)
        elapsed = time.perf_counter() - start

        print(f" Evaluation complete in {elapsed:.1f}s")
        print(f" Results: {result.metrics}")

        mlflow.log_param("agent", args.agent)
        mlflow.log_metrics({
            "trace_count": len(traces_df),
            "latency_ms": int(elapsed * 1000),
        })


if __name__ == "__main__":
main()
Loading
Loading