-
Notifications
You must be signed in to change notification settings - Fork 3
experiment: Agent evaluation via MLflow + OpenTelemetry #30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| __pycache__/ | ||
| *.pyc | ||
| .env | ||
| *.egg-info/ | ||
| results/ | ||
| output/ | ||
| *.jsonl | ||
| !fixtures/*.yaml | ||
| venv/ | ||
| .venv/ |
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,106 @@ | ||
| """Compare recent trace scores against golden baselines. | ||
|
|
||
| Loads a JSONL golden baseline file, computes mean scores per scorer, | ||
| then compares recent traces. Flags regression if any scorer drops | ||
| more than THRESHOLD below the baseline mean. | ||
|
|
||
| Usage: | ||
| python3 check_regression.py --agent explore --strict | ||
| python3 check_regression.py --agent explore --days 14 --threshold 0.15 | ||
| """ | ||
| import argparse | ||
| import json | ||
| import os | ||
| import sys | ||
|
|
||
| import mlflow | ||
|
|
||
| DEFAULT_THRESHOLD = 0.10 | ||
|
|
||
|
|
||
def connect():
    """Configure the MLflow connection from environment variables.

    Reads MLFLOW_TRACKING_URI and MLFLOW_OTLP_TOKEN. A non-empty token is
    exported as MLFLOW_TRACKING_PASSWORD (with MLFLOW_TRACKING_USERNAME
    "admin") unless those variables are already set. A non-empty URI is
    passed to ``mlflow.set_tracking_uri``.
    """
    env = os.environ
    tracking_uri = env.get("MLFLOW_TRACKING_URI", "")
    otlp_token = env.get("MLFLOW_OTLP_TOKEN", "")

    if otlp_token:
        # setdefault leaves credentials the caller exported explicitly alone.
        for key, fallback in (
            ("MLFLOW_TRACKING_USERNAME", "admin"),
            ("MLFLOW_TRACKING_PASSWORD", otlp_token),
        ):
            env.setdefault(key, fallback)

    if tracking_uri:
        mlflow.set_tracking_uri(tracking_uri)
|
|
||
|
|
||
def load_golden(agent: str) -> list[dict]:
    """Load golden baseline entries for *agent* from its JSONL file.

    The file is read from ``evals/baselines/{agent}-golden.jsonl`` relative
    to the current working directory, one JSON document per line. Blank
    lines are skipped.

    Returns:
        The parsed entries in file order, or an empty list (after printing
        a notice) when the baseline file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = f"evals/baselines/{agent}-golden.jsonl"
    try:
        # Explicit UTF-8: without it the decoding depends on the platform's
        # default encoding, which is not UTF-8 everywhere.
        with open(path, encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]
    except (FileNotFoundError, NotADirectoryError):
        # Opening directly (instead of exists-then-open) avoids a race if
        # the file disappears between the check and the read.
        print(f" No baseline found at {path}")
        return []
|
|
||
|
|
||
def compute_means(entries: list[dict]) -> dict[str, float]:
    """Compute the mean score per scorer across golden entries.

    Each entry may carry a ``"scores"`` mapping of scorer name to value.
    Only ``int``/``float`` values contribute; ``bool`` is a subclass of
    ``int``, so True/False count as 1/0. Any other value is ignored.
    Entries whose ``"scores"`` is missing, null, or not a mapping are
    skipped instead of raising.

    Returns:
        Mapping of scorer name to the arithmetic mean over the entries
        that supplied a numeric value for that scorer. Empty when no entry
        has numeric scores.
    """
    totals: dict[str, float] = {}
    counts: dict[str, int] = {}
    for entry in entries:
        scores = entry.get("scores") if isinstance(entry, dict) else None
        if not isinstance(scores, dict):
            # e.g. a JSONL line {"scores": null}: .get() returns None and
            # .items() would raise AttributeError.
            continue
        for scorer_name, value in scores.items():
            if isinstance(value, (int, float)):
                totals[scorer_name] = totals.get(scorer_name, 0) + value
                counts[scorer_name] = counts.get(scorer_name, 0) + 1
    return {name: totals[name] / counts[name] for name in totals}
|
|
||
|
|
||
def _find_regressions(golden_means, current_means, threshold):
    """Return one (scorer, golden, current, relative_change) tuple per regression.

    A scorer regresses when its current mean is more than *threshold*
    (a fraction, e.g. 0.10) below its golden mean, measured relative to
    the golden mean. A scorer missing from *current_means* is treated as
    0. A non-positive golden mean yields a relative change of 0 and is
    therefore never flagged.
    """
    regressions = []
    for scorer_name, golden_mean in golden_means.items():
        current_mean = current_means.get(scorer_name, 0)
        delta = current_mean - golden_mean
        pct = delta / golden_mean if golden_mean > 0 else 0
        if pct < -threshold:
            regressions.append((scorer_name, golden_mean, current_mean, pct))
    return regressions


def main():
    """CLI entry point: report golden baseline means and check for regressions.

    Parses the command line, connects to MLflow, loads the golden baseline
    for ``--agent`` and prints its per-scorer means. Scoring of recent
    traces is not implemented in this experiment, so the run ends with an
    explicit "no comparison performed" notice instead of a pass verdict.
    Once current means are available, regressions are printed and, with
    ``--strict``, the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description="Check for quality regressions")
    parser.add_argument("--agent", required=True)
    parser.add_argument("--days", type=int, default=7)
    parser.add_argument("--max-traces", type=int, default=50)
    parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD)
    parser.add_argument("--strict", action="store_true", help="Exit 1 on any regression")
    args = parser.parse_args()

    connect()
    mlflow.autolog(disable=True)

    golden = load_golden(args.agent)
    if not golden:
        print(f" Skipping {args.agent} — no baseline")
        return

    golden_means = compute_means(golden)
    print(f" Golden baseline ({len(golden)} traces): {golden_means}")

    # In production, you would:
    # 1. Fetch recent traces via mlflow.search_traces()
    # 2. Score them with the same scorers used for golden
    # 3. Compare means
    #
    # Simplified here for the experiment example:
    print(" To complete: fetch recent traces, score, compare against golden means")
    print(f" Regression threshold: {args.threshold * 100:.0f}%")

    # TODO: populate with the per-scorer means of recently scored traces.
    current_means = None
    if current_means is None:
        # Without current scores there is nothing to compare; printing
        # "No regression" here would be a false all-clear for CI.
        print("\n No comparison performed — recent traces were not scored.")
        return

    regressions = _find_regressions(golden_means, current_means, args.threshold)
    if regressions:
        print("\n !! REGRESSION detected:")
        for name, gold, curr, pct in regressions:
            print(f" {name}: golden={gold:.3f}, current={curr:.3f} ({pct:+.1%})")
        if args.strict:
            sys.exit(1)
    else:
        print("\n All scorers within threshold. No regression.")
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| # Example harness configuration for the explore agent. | ||
| # The eval section is the single source of truth for quality gates. | ||
| # harness.py reads this at runtime and resolves scorer names to Python functions. | ||
|
|
||
| agent: customized/agents/explore.md | ||
| model: opus | ||
| image: ghcr.io/fullsend-ai/fullsend-sandbox:latest | ||
| policy: customized/policies/explore.yaml | ||
|
|
||
| skills: | ||
| - customized/skills/public-research | ||
| - customized/skills/jira-read | ||
|
|
||
| pre_script: customized/scripts/pre-explore.sh | ||
|
|
||
| validation_loop: | ||
| script: scripts/validate-output-schema.sh | ||
| max_iterations: 2 | ||
|
|
||
| post_script: customized/scripts/post-explore.sh | ||
|
|
||
| timeout_minutes: 20 | ||
|
|
||
| eval: | ||
| scorers: | ||
| mechanical: | ||
| - validation_passed | ||
| - tool_efficiency | ||
| - cost_within_budget | ||
| - confidence_coherence | ||
| - iteration_count | ||
| llm_judge: | ||
| model: claude-opus-4-6 | ||
| criteria: | ||
| - name: explore_context_quality | ||
| guidelines: > | ||
| Is the gathered context relevant, specific, and complete? | ||
| Did the agent look in the right places? Is context specific | ||
| enough for refinement? Were constraints/risks identified? | ||
| - name: reasoning_coherence | ||
| guidelines: > | ||
| Is reasoning logically coherent and evidence-based? | ||
| gates: | ||
| min_validation_rate: 0.80 | ||
| min_quality_score: 3.0 | ||
| max_cost: 2.00 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [Medium] The harness YAML defines |
||
| baseline: evals/baselines/explore-golden.jsonl | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,99 @@ | ||
| """Register agent prompts in MLflow Prompts Registry. | ||
|
|
||
| Reads agent prompt markdown files and registers them as versioned prompts | ||
| with @staging or @production aliases. Uses content-hash dedup to skip | ||
| unchanged prompts while still updating the alias. | ||
|
|
||
| Usage: | ||
| python3 register_prompts.py --alias staging | ||
| python3 register_prompts.py --alias production | ||
| python3 register_prompts.py --alias staging --agents explore refine | ||
|
|
||
| Env: | ||
| GIT_COMMIT — Current git commit hash (for metadata) | ||
| GIT_BRANCH — Current git branch name | ||
| """ | ||
| import argparse | ||
| import hashlib | ||
| import os | ||
| from pathlib import Path | ||
|
|
||
| import mlflow | ||
| from mlflow import MlflowClient | ||
|
|
||
| AGENTS_DIR = Path(".fullsend/customized/agents") | ||
| PROMPT_PREFIX = "fullsend" | ||
|
|
||
|
|
||
def connect():
    """Configure the MLflow connection from environment variables.

    Reads MLFLOW_TRACKING_URI and MLFLOW_OTLP_TOKEN. A non-empty token is
    exported as MLFLOW_TRACKING_PASSWORD (with MLFLOW_TRACKING_USERNAME
    "admin") unless those variables are already set. A non-empty URI is
    passed to ``mlflow.set_tracking_uri``.
    """
    env = os.environ
    tracking_uri = env.get("MLFLOW_TRACKING_URI", "")
    otlp_token = env.get("MLFLOW_OTLP_TOKEN", "")

    if otlp_token:
        # setdefault leaves credentials the caller exported explicitly alone.
        env.setdefault("MLFLOW_TRACKING_USERNAME", "admin")
        env.setdefault("MLFLOW_TRACKING_PASSWORD", otlp_token)

    if not tracking_uri:
        return
    mlflow.set_tracking_uri(tracking_uri)
|
|
||
|
|
||
def content_hash(text: str) -> str:
    """Return the first 12 hex characters of the SHA-256 digest of *text*."""
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    hex_digest = hasher.hexdigest()
    return hex_digest[:12]
|
|
||
|
|
||
def register_prompt(agent: str, alias: str, client: MlflowClient):
    """Register one agent's prompt in MLflow and point *alias* at it.

    Reads ``AGENTS_DIR/{agent}.md``. If the version returned by the
    registry search already carries the same ``content.hash`` tag, no new
    version is created and only the alias is moved. Otherwise the file
    content is registered as a new version tagged with git metadata
    (GIT_COMMIT / GIT_BRANCH from the environment, "unknown" if unset),
    and the alias is set to that new version.

    Args:
        agent: Agent name; selects the markdown file and the prompt name
            ``{PROMPT_PREFIX}-{agent}``.
        alias: Alias to assign to the resulting version.
        client: MLflow client used to look up existing prompt versions.

    Returns:
        None. A missing prompt file is reported and skipped.
    """
    prompt_path = AGENTS_DIR / f"{agent}.md"
    if not prompt_path.exists():
        print(f" SKIP {agent} — {prompt_path} not found")
        return

    # Decode explicitly as UTF-8 so the registered template and its content
    # hash do not depend on the platform's default encoding.
    content = prompt_path.read_text(encoding="utf-8")
    chash = content_hash(content)
    prompt_name = f"{PROMPT_PREFIX}-{agent}"

    git_commit = os.environ.get("GIT_COMMIT", "unknown")
    git_branch = os.environ.get("GIT_BRANCH", "unknown")

    tags = {
        "git.commit": git_commit,
        "git.branch": git_branch,
        "content.hash": chash,
        "agent": agent,
        "source": str(prompt_path),
    }

    # NOTE(review): the dedup below assumes the single result returned here
    # is the latest version — confirm the ordering of search_prompt_versions.
    existing = client.search_prompt_versions(name=prompt_name, max_results=1)
    if existing:
        latest = existing[0]
        latest_hash = (latest.tags or {}).get("content.hash", "")
        if latest_hash == chash:
            print(f" {prompt_name}: content unchanged (hash={chash}), updating alias only")
            mlflow.genai.set_prompt_alias(prompt_name, alias, latest.version)
            return

    version = mlflow.genai.register_prompt(
        name=prompt_name,
        template=content,
        commit_message=f"{alias}: {agent} prompt ({chash})",
        tags=tags,
    )
    print(f" {prompt_name}: registered v{version.version} (hash={chash})")

    mlflow.genai.set_prompt_alias(prompt_name, alias, version.version)
    print(f" {prompt_name}: alias @{alias} -> v{version.version}")
|
|
||
|
|
||
def main():
    """CLI entry point: register each requested agent prompt under one alias."""
    cli = argparse.ArgumentParser(description="Register prompts in MLflow")
    cli.add_argument("--alias", required=True, choices=["staging", "production"])
    cli.add_argument("--agents", nargs="+", default=["explore", "refine", "critique"])
    options = cli.parse_args()

    connect()
    registry_client = MlflowClient()

    # One client is shared across all agents.
    for agent_name in options.agents:
        register_prompt(agent_name, options.alias, registry_client)
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,108 @@ | ||
| """Score traces via mlflow.genai.evaluate() and log operational metrics. | ||
|
|
||
| Reads traces from MLflow, resolves scorers from harness config, and runs | ||
| evaluation. Results appear as Feedbacks on traces (Quality Dashboard) and | ||
| as metrics on the evaluation run (Evaluation Runs page). | ||
|
|
||
| Usage: | ||
| python3 run_eval.py --agent explore --days 7 --max-traces 10 | ||
| python3 run_eval.py --agent explore --mechanical-only | ||
| """ | ||
| import argparse | ||
| import os | ||
| import time | ||
|
|
||
| import mlflow | ||
| from mlflow import MlflowClient | ||
|
|
||
|
|
||
def connect():
    """Configure the MLflow tracking connection from environment variables.

    Reads MLFLOW_TRACKING_URI and MLFLOW_OTLP_TOKEN. A non-empty token is
    exported as MLFLOW_TRACKING_PASSWORD (with MLFLOW_TRACKING_USERNAME
    "admin") unless those variables are already set. A non-empty URI is
    passed to ``mlflow.set_tracking_uri``.
    """
    tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "")
    otlp_token = os.environ.get("MLFLOW_OTLP_TOKEN", "")

    # setdefault leaves credentials the caller exported explicitly alone.
    credentials = {
        "MLFLOW_TRACKING_USERNAME": "admin",
        "MLFLOW_TRACKING_PASSWORD": otlp_token,
    }
    if otlp_token:
        for key, fallback in credentials.items():
            os.environ.setdefault(key, fallback)

    if tracking_uri:
        mlflow.set_tracking_uri(tracking_uri)
|
|
||
|
|
||
def get_traces(agent=None, days=7, max_results=50):
    """Query MLflow for traces, narrowing by agent tag and age when given.

    Args:
        agent: Value matched against the ``fullsend.agent`` trace tag; a
            falsy value disables the tag filter.
        days: Look-back window in days; a falsy value disables the time
            filter.
        max_results: Passed through to ``mlflow.search_traces``.

    Returns:
        Whatever ``mlflow.search_traces`` returns for the combined filter.
    """
    import datetime

    clauses = []
    if agent:
        clauses.append(f"tags.`fullsend.agent` = '{agent}'")
    if days:
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        cutoff = now_utc - datetime.timedelta(days=days)
        # The filter compares against epoch milliseconds.
        cutoff_ms = int(cutoff.timestamp() * 1000)
        clauses.append(f"timestamp > {cutoff_ms}")

    # An empty clause list joins to "", which becomes None (no filter).
    return mlflow.search_traces(
        locations=["0"],
        filter_string=" AND ".join(clauses) or None,
        max_results=max_results,
    )
|
|
||
|
|
||
def resolve_scorers(agent, mechanical_only=False):
    """Return the scorer list to run for *agent*.

    The mechanical scorers are always included. Unless *mechanical_only*
    is set, the LLM-judge scorers for "explore", "refine" or "critique"
    are appended; any other agent name gets the mechanical scorers alone.

    In production, this reads the harness YAML. Here we import directly.
    """
    from scorer_mechanical import MECHANICAL_SCORERS as mechanical

    if mechanical_only:
        return mechanical

    # The judge module is imported lazily, and only for a known agent.
    if agent == "explore":
        from scorer_llm_judge import EXPLORE_SCORERS as judges
    elif agent == "refine":
        from scorer_llm_judge import REFINE_SCORERS as judges
    elif agent == "critique":
        from scorer_llm_judge import CRITIQUE_SCORERS as judges
    else:
        return mechanical

    return mechanical + judges
|
|
||
|
|
||
def main():
    """CLI entry point: fetch recent traces for an agent and score them.

    Parses the command line, connects to MLflow, fetches traces via
    ``get_traces`` and runs ``mlflow.genai.evaluate`` with the scorers from
    ``resolve_scorers``. The evaluation and the operational metrics
    (trace count, evaluation wall time) are logged inside one explicit
    MLflow run so they appear together. Returns early when no traces match.
    """
    parser = argparse.ArgumentParser(description="Score traces via MLflow")
    parser.add_argument("--agent", required=True, help="Agent name (explore, refine, critique)")
    parser.add_argument("--days", type=int, default=7, help="Look-back window in days")
    parser.add_argument("--max-traces", type=int, default=50, help="Max traces to score")
    parser.add_argument("--mechanical-only", action="store_true", help="Skip LLM judges")
    args = parser.parse_args()

    connect()
    mlflow.autolog(disable=True)

    print(f"Fetching traces for {args.agent} (last {args.days} days)...")
    traces_df = get_traces(agent=args.agent, days=args.days, max_results=args.max_traces)
    print(f" Found {len(traces_df)} traces")

    if traces_df.empty:
        print(" No traces to score.")
        return

    scorers = resolve_scorers(args.agent, args.mechanical_only)
    print(f" Running {len(scorers)} scorers...")

    # Open the run explicitly. Otherwise log_param/log_metrics, called with
    # no active run, implicitly start a second run that is separate from
    # the one holding the evaluation results.
    with mlflow.start_run():
        # perf_counter is monotonic, so the measured duration is not
        # affected by wall-clock adjustments.
        start = time.perf_counter()
        result = mlflow.genai.evaluate(data=traces_df, scorers=scorers)
        elapsed = time.perf_counter() - start

        print(f" Evaluation complete in {elapsed:.1f}s")
        print(f" Results: {result.metrics}")

        mlflow.log_param("agent", args.agent)
        mlflow.log_metrics({
            "trace_count": len(traces_df),
            "latency_ms": int(elapsed * 1000),
        })
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[Medium] The golden baselines are curated from production traces against different work items with different complexity.
`compute_means()` averages across these heterogeneous traces, then compares against recent traces from yet other work items. A score drop could mean the agent regressed — or that recent work items were harder. Production trace means are useful for observability, metrics, and trend analysis, but they shouldn't be the baseline used for evaluation — too many uncontrolled factors can cause deviation: infrastructure outages (e.g., GitHub downtime), more complex inputs, human activity interlaced with agentic activity, poorly described issues providing bad input. Reliable evaluation baselines need fixed, controlled inputs to isolate agent quality from environmental variance. This PR already has fixtures (`fixtures/input.yaml`, `fixtures/rubric.yaml`) — consider using them to build the golden baselines rather than curating from production traces. Fixture-based baselines provide the controlled reference that production traces can't.