fullsend-ai · ascerra · Jun 9, 2026 · Jun 9, 2026 · maruiz93 · Jun 16, 2026
@@ -0,0 +1,10 @@
+__pycache__/
+*.pyc
+.env
+*.egg-info/
+results/
+output/
+*.jsonl
+# Golden baselines are JSONL as well; keep them tracked so check_regression.py
+# can find evals/baselines/<agent>-golden.jsonl on a fresh checkout.
+# NOTE(review): assumes baselines are meant to be committed — confirm.
+!**/evals/baselines/*-golden.jsonl
+!fixtures/*.yaml
+venv/
+.venv/
@@ -0,0 +1,106 @@
+"""Compare recent trace scores against golden baselines.
+
+Loads a JSONL golden baseline file, computes mean scores per scorer,
+then compares recent traces. Flags regression if any scorer drops
+more than THRESHOLD below the baseline mean.
+
+Usage:
+    python3 check_regression.py --agent explore --strict
+    python3 check_regression.py --agent explore --days 14 --threshold 0.15
+"""
+import argparse
+import json
+import os
+import sys
+
+import mlflow
+
+DEFAULT_THRESHOLD = 0.10
+
+
+def connect():
+    """Configure MLflow access from environment variables.
+
+    When MLFLOW_OTLP_TOKEN is set, it is exported as the tracking password
+    with user "admin", unless those variables are already defined. When
+    MLFLOW_TRACKING_URI is set, it becomes the MLflow tracking URI.
+    """
+    env = os.environ
+    otlp_token = env.get("MLFLOW_OTLP_TOKEN", "")
+    if otlp_token:
+        # setdefault keeps credentials the caller exported explicitly.
+        env.setdefault("MLFLOW_TRACKING_USERNAME", "admin")
+        env.setdefault("MLFLOW_TRACKING_PASSWORD", otlp_token)
+
+    tracking_uri = env.get("MLFLOW_TRACKING_URI", "")
+    if tracking_uri:
+        mlflow.set_tracking_uri(tracking_uri)
+
+
+def load_golden(agent: str) -> list[dict]:
+    """Load golden baseline entries for *agent* from its JSONL file.
+
+    Reads ``evals/baselines/{agent}-golden.jsonl`` relative to the current
+    working directory, one JSON document per non-blank line.
+
+    Returns:
+        The parsed entries in file order, or an empty list (after printing a
+        notice) when the file does not exist.
+
+    Raises:
+        json.JSONDecodeError: If a line is not valid JSON. The message is
+            prefixed with the file path and line number.
+    """
+    path = f"evals/baselines/{agent}-golden.jsonl"
+    if not os.path.exists(path):
+        print(f"  No baseline found at {path}")
+        return []
+    entries = []
+    # Decode explicitly as UTF-8 so parsing does not depend on the host locale.
+    with open(path, encoding="utf-8") as f:
+        for lineno, line in enumerate(f, start=1):
+            if not line.strip():
+                continue
+            try:
+                entries.append(json.loads(line))
+            except json.JSONDecodeError as err:
+                # Same exception type, but say which line of which file is bad.
+                raise json.JSONDecodeError(
+                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
+                ) from err
+    return entries
+
+
+def compute_means(entries: list[dict]) -> dict[str, float]:
+    """Compute the mean score per scorer across golden entries.
+
+    Each entry may carry a ``"scores"`` mapping of scorer name to value.
+    Entries whose ``"scores"`` is missing or not a mapping (for example
+    ``null`` in the JSONL) are skipped, as are non-numeric values.
+
+    Returns:
+        Mapping of scorer name to the arithmetic mean of its numeric values.
+        Scorers with no numeric values do not appear in the result.
+    """
+    totals: dict[str, float] = {}
+    counts: dict[str, int] = {}
+    for entry in entries:
+        scores = entry.get("scores")
+        if not isinstance(scores, dict):
+            continue
+        for scorer_name, value in scores.items():
+            # bool is a subclass of int, so True/False count as 1/0 here.
+            if not isinstance(value, (int, float)):
+                continue
+            totals[scorer_name] = totals.get(scorer_name, 0) + value
+            counts[scorer_name] = counts.get(scorer_name, 0) + 1
+    return {name: totals[name] / counts[name] for name in totals}
+
+
+def main():
+    """Parse CLI arguments and run the regression check for one agent.
+
+    Loads the agent's golden baseline and prints its per-scorer means. Scoring
+    of recent traces is not wired up in this example, so ``current_means``
+    stays empty and the script reports that no comparison was made rather
+    than claiming a clean result.
+
+    Exits with status 1 only when ``--strict`` is set and a regression is
+    found; returns normally in every other case, including a missing baseline.
+    """
+    parser = argparse.ArgumentParser(description="Check for quality regressions")
+    parser.add_argument("--agent", required=True)
+    parser.add_argument("--days", type=int, default=7)
+    parser.add_argument("--max-traces", type=int, default=50)
+    parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD)
+    parser.add_argument("--strict", action="store_true", help="Exit 1 on any regression")
+    args = parser.parse_args()
+
+    connect()
+    mlflow.autolog(disable=True)
+
+    golden = load_golden(args.agent)
+    if not golden:
+        print(f"  Skipping {args.agent} — no baseline")
+        return
+
+    golden_means = compute_means(golden)
+    print(f"  Golden baseline ({len(golden)} traces): {golden_means}")
+
+    # In production, you would:
+    # 1. Fetch recent traces via mlflow.search_traces()
+    # 2. Score them with the same scorers used for golden
+    # 3. Store the per-scorer means in current_means
+    #
+    # Simplified here for the experiment example: nothing is fetched, so
+    # current_means stays empty (--days and --max-traces are parsed but unused).
+    current_means: dict[str, float] = {}
+    print("  To complete: fetch recent traces, score, compare against golden means")
+    print(f"  Regression threshold: {args.threshold * 100:.0f}%")
+
+    if not current_means:
+        # Without current scores there is nothing to judge; do not report a pass.
+        print("\n  No current scores to compare. Regression check not performed.")
+        return
+
+    regressions = []
+    for scorer_name, golden_mean in golden_means.items():
+        # A scorer absent from the current run is treated as scoring 0.
+        current_mean = current_means.get(scorer_name, 0)
+        delta = current_mean - golden_mean
+        # Relative change; a non-positive baseline has no meaningful percentage.
+        pct = delta / golden_mean if golden_mean > 0 else 0
+        if pct < -args.threshold:
+            regressions.append((scorer_name, golden_mean, current_mean, pct))
+
+    if regressions:
+        print("\n  !! REGRESSION detected:")
+        for name, gold, curr, pct in regressions:
+            print(f"     {name}: golden={gold:.3f}, current={curr:.3f} ({pct:+.1%})")
+        if args.strict:
+            sys.exit(1)
+    else:
+        print("\n  All scorers within threshold. No regression.")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,47 @@
+# Example harness configuration for the explore agent.
+# The eval section is the single source of truth for quality gates.
+# harness.py reads this at runtime and resolves scorer names to Python functions.
+# NOTE(review): run_eval.py imports its scorer lists directly and
+# check_regression.py builds the baseline path itself; neither reads this file.
+
+agent: customized/agents/explore.md
+model: opus
+image: ghcr.io/fullsend-ai/fullsend-sandbox:latest
+policy: customized/policies/explore.yaml
+
+skills:
+  - customized/skills/public-research
+  - customized/skills/jira-read
+
+pre_script: customized/scripts/pre-explore.sh
+
+validation_loop:
+  script: scripts/validate-output-schema.sh
+  max_iterations: 2
+
+post_script: customized/scripts/post-explore.sh
+
+timeout_minutes: 20
+
+eval:
+  scorers:
+    # Scorer names only; presumably resolved to functions by harness.py — confirm.
+    mechanical:
+      - validation_passed
+      - tool_efficiency
+      - cost_within_budget
+      - confidence_coherence
+      - iteration_count
+    llm_judge:
+      # Judge model, configured separately from the top-level agent `model`.
+      model: claude-opus-4-6
+      criteria:
+        - name: explore_context_quality
+          guidelines: >
+            Is the gathered context relevant, specific, and complete?
+            Did the agent look in the right places? Is context specific
+            enough for refinement? Were constraints/risks identified?
+        - name: reasoning_coherence
+          guidelines: >
+            Is reasoning logically coherent and evidence-based?
+  gates:
+    # NOTE(review): units are not stated here — presumably a 0-1 rate, a
+    # judge-score scale, and a currency amount respectively; confirm.
+    min_validation_rate: 0.80
+    min_quality_score: 3.0
+    max_cost: 2.00
+  # Same path that check_regression.py derives for --agent explore.
+  baseline: evals/baselines/explore-golden.jsonl
@@ -0,0 +1,99 @@
+"""Register agent prompts in MLflow Prompts Registry.
+
+Reads agent prompt markdown files and registers them as versioned prompts
+with @staging or @production aliases. Uses content-hash dedup to skip
+unchanged prompts while still updating the alias.
+
+Usage:
+    python3 register_prompts.py --alias staging
+    python3 register_prompts.py --alias production
+    python3 register_prompts.py --alias staging --agents explore refine
+
+Env:
+    GIT_COMMIT  — Current git commit hash (for metadata)
+    GIT_BRANCH  — Current git branch name
+"""
+import argparse
+import hashlib
+import os
+from pathlib import Path
+
+import mlflow
+from mlflow import MlflowClient
+
+AGENTS_DIR = Path(".fullsend/customized/agents")
+PROMPT_PREFIX = "fullsend"
+
+
+def connect():
+    """Set up the MLflow tracking connection from environment variables.
+
+    MLFLOW_OTLP_TOKEN, when present, supplies default basic-auth credentials
+    (user "admin"); MLFLOW_TRACKING_URI, when present, selects the server.
+    """
+    otlp_token = os.environ.get("MLFLOW_OTLP_TOKEN", "")
+    if otlp_token:
+        defaults = (
+            ("MLFLOW_TRACKING_USERNAME", "admin"),
+            ("MLFLOW_TRACKING_PASSWORD", otlp_token),
+        )
+        # Only fills in values that are not already exported.
+        for variable, value in defaults:
+            os.environ.setdefault(variable, value)
+
+    tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "")
+    if tracking_uri:
+        mlflow.set_tracking_uri(tracking_uri)
+
+
+def content_hash(text: str) -> str:
+    """Return the first 12 hex characters of the SHA-256 digest of *text*."""
+    digest = hashlib.sha256()
+    digest.update(text.encode())
+    return digest.hexdigest()[:12]
+
+
+def register_prompt(agent: str, alias: str, client: MlflowClient):
+    """Register a single agent's prompt in MLflow and point *alias* at it.
+
+    Reads ``AGENTS_DIR/<agent>.md``. If the version returned by the registry
+    carries the same ``content.hash`` tag, no new version is created and only
+    the alias is moved; otherwise a new version is registered and aliased.
+
+    Args:
+        agent: Agent name; also the stem of the markdown file.
+        alias: Alias to set on the resulting prompt version.
+        client: MLflow client used to look up existing prompt versions.
+
+    Returns:
+        None. Progress is printed; a missing prompt file is skipped.
+    """
+    prompt_path = AGENTS_DIR / f"{agent}.md"
+    if not prompt_path.exists():
+        print(f"  SKIP {agent} — {prompt_path} not found")
+        return
+
+    # Decode as UTF-8 explicitly: content_hash() encodes with UTF-8, so a
+    # locale-dependent read could yield different hashes on different hosts.
+    content = prompt_path.read_text(encoding="utf-8")
+    chash = content_hash(content)
+    prompt_name = f"{PROMPT_PREFIX}-{agent}"
+
+    git_commit = os.environ.get("GIT_COMMIT", "unknown")
+    git_branch = os.environ.get("GIT_BRANCH", "unknown")
+
+    tags = {
+        "git.commit": git_commit,
+        "git.branch": git_branch,
+        "content.hash": chash,
+        "agent": agent,
+        "source": str(prompt_path),
+    }
+
+    # NOTE(review): this assumes the single result is the newest version —
+    # confirm the ordering search_prompt_versions guarantees.
+    existing = client.search_prompt_versions(name=prompt_name, max_results=1)
+    if existing:
+        latest = existing[0]
+        latest_hash = (latest.tags or {}).get("content.hash", "")
+        if latest_hash == chash:
+            print(f"  {prompt_name}: content unchanged (hash={chash}), updating alias only")
+            mlflow.genai.set_prompt_alias(prompt_name, alias, latest.version)
+            return
+
+    version = mlflow.genai.register_prompt(
+        name=prompt_name,
+        template=content,
+        commit_message=f"{alias}: {agent} prompt ({chash})",
+        tags=tags,
+    )
+    print(f"  {prompt_name}: registered v{version.version} (hash={chash})")
+
+    mlflow.genai.set_prompt_alias(prompt_name, alias, version.version)
+    print(f"  {prompt_name}: alias @{alias} -> v{version.version}")
+
+
+def main():
+    """CLI entry point: register each requested agent prompt under one alias."""
+    cli = argparse.ArgumentParser(description="Register prompts in MLflow")
+    cli.add_argument(
+        "--alias",
+        required=True,
+        choices=["staging", "production"],
+    )
+    cli.add_argument(
+        "--agents",
+        nargs="+",
+        default=["explore", "refine", "critique"],
+    )
+    options = cli.parse_args()
+
+    # Connect before creating the client so it picks up the tracking settings.
+    connect()
+    registry = MlflowClient()
+
+    for agent_name in options.agents:
+        register_prompt(agent_name, options.alias, registry)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,108 @@
+"""Score traces via mlflow.genai.evaluate() and log operational metrics.
+
+Reads traces from MLflow, resolves scorers from harness config, and runs
+evaluation. Results appear as Feedbacks on traces (Quality Dashboard) and
+as metrics on the evaluation run (Evaluation Runs page).
+
+Usage:
+    python3 run_eval.py --agent explore --days 7 --max-traces 10
+    python3 run_eval.py --agent explore --mechanical-only
+"""
+import argparse
+import os
+import time
+
+import mlflow
+from mlflow import MlflowClient
+
+
+def connect():
+    """Set up MLflow tracking connection.
+
+    Uses MLFLOW_OTLP_TOKEN (if set) as the default tracking password for user
+    "admin", and MLFLOW_TRACKING_URI (if set) as the tracking server.
+    """
+    if otlp_token := os.environ.get("MLFLOW_OTLP_TOKEN", ""):
+        # Existing username/password exports take precedence.
+        os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "admin")
+        os.environ.setdefault("MLFLOW_TRACKING_PASSWORD", otlp_token)
+    if tracking_uri := os.environ.get("MLFLOW_TRACKING_URI", ""):
+        mlflow.set_tracking_uri(tracking_uri)
+
+
+def get_traces(agent=None, days=7, max_results=50):
+    """Search MLflow for traces, optionally narrowed by agent tag and age.
+
+    Args:
+        agent: If truthy, only traces tagged ``fullsend.agent`` with this value.
+        days: If truthy, only traces newer than this many days (UTC cutoff,
+            expressed in milliseconds since the epoch).
+        max_results: Upper bound passed through to the search.
+
+    Returns:
+        Whatever ``mlflow.search_traces`` returns for location "0".
+    """
+    clauses = [f"tags.`fullsend.agent` = '{agent}'"] if agent else []
+    if days:
+        from datetime import datetime, timedelta, timezone
+
+        oldest = datetime.now(timezone.utc) - timedelta(days=days)
+        oldest_ms = int(oldest.timestamp() * 1000)
+        clauses.append(f"timestamp > {oldest_ms}")
+
+    # With no clauses the search runs unfiltered (filter_string=None).
+    return mlflow.search_traces(
+        locations=["0"],
+        filter_string=" AND ".join(clauses) or None,
+        max_results=max_results,
+    )
+
+
+def resolve_scorers(agent, mechanical_only=False):
+    """Return the scorer list to run for *agent*.
+
+    Mechanical scorers are always included. Unless *mechanical_only* is set,
+    the explore, refine and critique agents also get their LLM-judge scorers;
+    any other agent gets the mechanical scorers alone.
+
+    In production, this reads the harness YAML. Here we import directly.
+    """
+    from scorer_mechanical import MECHANICAL_SCORERS
+
+    if mechanical_only or agent not in ("explore", "refine", "critique"):
+        return MECHANICAL_SCORERS
+
+    # Import lazily so the judge module is only loaded when it is needed.
+    if agent == "explore":
+        from scorer_llm_judge import EXPLORE_SCORERS as judge_scorers
+    elif agent == "refine":
+        from scorer_llm_judge import REFINE_SCORERS as judge_scorers
+    else:
+        from scorer_llm_judge import CRITIQUE_SCORERS as judge_scorers
+    return MECHANICAL_SCORERS + judge_scorers
+
+
+def main():
+    """CLI entry point: fetch recent traces for an agent and score them.
+
+    Evaluation and the operational metadata (agent name, trace count,
+    evaluation latency) are recorded inside one explicitly started MLflow run,
+    so they end up on the same run instead of the metadata landing on a
+    separate run auto-created after evaluation has finished.
+    """
+    parser = argparse.ArgumentParser(description="Score traces via MLflow")
+    parser.add_argument("--agent", required=True, help="Agent name (explore, refine, critique)")
+    parser.add_argument("--days", type=int, default=7, help="Look-back window in days")
+    parser.add_argument("--max-traces", type=int, default=50, help="Max traces to score")
+    parser.add_argument("--mechanical-only", action="store_true", help="Skip LLM judges")
+    args = parser.parse_args()
+
+    connect()
+    mlflow.autolog(disable=True)
+
+    print(f"Fetching traces for {args.agent} (last {args.days} days)...")
+    traces_df = get_traces(agent=args.agent, days=args.days, max_results=args.max_traces)
+    print(f"  Found {len(traces_df)} traces")
+
+    if traces_df.empty:
+        print("  No traces to score.")
+        return
+
+    scorers = resolve_scorers(args.agent, args.mechanical_only)
+    print(f"  Running {len(scorers)} scorers...")
+
+    # One run for both the evaluation and the operational metrics; the context
+    # manager also ends the run if evaluation raises.
+    with mlflow.start_run():
+        # perf_counter is monotonic, so wall-clock adjustments cannot skew it.
+        start = time.perf_counter()
+        result = mlflow.genai.evaluate(data=traces_df, scorers=scorers)
+        elapsed = time.perf_counter() - start
+
+        print(f"  Evaluation complete in {elapsed:.1f}s")
+        print(f"  Results: {result.metrics}")
+
+        mlflow.log_param("agent", args.agent)
+        mlflow.log_metrics({
+            "trace_count": len(traces_df),
+            "latency_ms": int(elapsed * 1000),
+        })
+
+
+if __name__ == "__main__":
+    main()