Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion cmd/agenttrace/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ func main() {
failUnderHealth := flag.Int("fail-under-health", 0, "Exit non-zero when overview average health is below this score")
failOnCritical := flag.Bool("fail-on-critical", false, "Exit non-zero when overview contains critical sessions")
maxToolFailRate := flag.Float64("max-tool-fail-rate", -1, "Exit non-zero when overview tool failure rate exceeds this percent")
baseline := flag.String("baseline", "", "Compare --overview -f json against a local baseline JSON report")
baselineMaxDurationDeltaPct := flag.Float64("baseline-max-duration-delta-pct", 0, "Allowed overview duration increase percent versus --baseline")
baselineMaxCostDeltaPct := flag.Float64("baseline-max-cost-delta-pct", 0, "Allowed overview cost increase percent versus --baseline")
baselineMaxTokenDeltaPct := flag.Float64("baseline-max-token-delta-pct", 0, "Allowed overview token increase percent versus --baseline")
lang := flag.String("lang", "en", "Language for report output: en, zh")
flag.Parse()

Expand Down Expand Up @@ -127,7 +131,7 @@ func main() {
return
}

hasAction := path != "" || *latest || *compare || *overview || *wasteFlag
hasAction := path != "" || *latest || *compare || *overview || *wasteFlag || *baseline != ""

if !hasAction {
// Launch TUI
Expand All @@ -139,6 +143,10 @@ func main() {
}
return
}
if *baseline != "" && !*overview {
fmt.Fprintln(os.Stderr, "--baseline requires --overview -f json")
os.Exit(1)
}

// Overview mode
if *overview {
Expand All @@ -161,9 +169,25 @@ func main() {
}
ov := engine.ComputeOverview(sessions)
out := engine.ReportOverview(ov, sessions)
if *baseline != "" && strings.ToLower(*format) != "json" {
fmt.Fprintln(os.Stderr, "--baseline requires --overview -f json")
os.Exit(1)
}
switch *format {
case "json":
out = engine.ReportOverviewJSON(ov, sessions)
if *baseline != "" {
var baselineErr error
out, baselineErr = engine.AddBaselineComparison(out, *baseline, engine.BaselineThresholds{
MaxDurationDeltaPct: *baselineMaxDurationDeltaPct,
MaxCostDeltaPct: *baselineMaxCostDeltaPct,
MaxTokenDeltaPct: *baselineMaxTokenDeltaPct,
})
if baselineErr != nil {
fmt.Fprintf(os.Stderr, i18n.T("cli_error"), baselineErr)
os.Exit(1)
}
}
case "markdown", "md":
out = engine.ReportOverviewMarkdown(ov, sessions)
case "html":
Expand Down
27 changes: 27 additions & 0 deletions docs/ci-integration.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,23 @@ agenttrace --overview -f json \
-o agenttrace-overview.json
```

To compare a current run with a local CI baseline artifact, keep a previous
`--overview -f json` report and pass it back with explicit delta thresholds:

```bash
agenttrace --overview -f json \
--baseline agenttrace-baseline.json \
--baseline-max-duration-delta-pct 10 \
--baseline-max-cost-delta-pct 15 \
--baseline-max-token-delta-pct 20 \
-o agenttrace-overview.json
```

The JSON report gains a `baseline_comparison` object with deterministic fields
for duration, cost, and token deltas, new failure families, broader tool/file
surfaces, and new high-authority tool use. The baseline must be a local JSON
report produced by the same agenttrace version; a missing baseline file or a
version mismatch is reported as an error and the command exits non-zero.

## GitHub Actions

```yaml
Expand All @@ -69,6 +86,15 @@ jobs:
--fail-on-critical \
--max-tool-fail-rate 15 \
-o agenttrace-overview.json
- name: Compare against local baseline
if: hashFiles('agenttrace-baseline.json') != ''
run: |
agenttrace --overview -f json \
--baseline agenttrace-baseline.json \
--baseline-max-duration-delta-pct 10 \
--baseline-max-cost-delta-pct 15 \
--baseline-max-token-delta-pct 20 \
-o agenttrace-overview.json
- name: Write Markdown summary
if: always()
run: |
Expand Down Expand Up @@ -104,6 +130,7 @@ scripts/ci/check-pages-artifact.sh site
These checks cover:

- demo JSON, Markdown, HTML, and doctor smoke output
- local baseline comparison JSON contract and deterministic fields
- `-o` stdout/stderr behavior and failing gate exit code `2`
- repeated demo latest/overview JSON determinism
- report cost-label and version metadata consistency
Expand Down
182 changes: 182 additions & 0 deletions internal/engine/baseline.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
package engine

import (
"encoding/json"
"errors"
"fmt"
"os"
)

// BaselineThresholds holds the percentage limits that the baseline comparison
// checks the computed deltas against. Each limit is compared with a strict
// "greater than", so a delta exactly equal to its limit is not flagged.
type BaselineThresholds struct {
// MaxDurationDeltaPct is the limit for the duration delta percent.
MaxDurationDeltaPct float64 `json:"max_duration_delta_pct"`
// MaxCostDeltaPct is the limit for the cost delta percent.
MaxCostDeltaPct float64 `json:"max_cost_delta_pct"`
// MaxTokenDeltaPct is the limit for the token delta percent.
MaxTokenDeltaPct float64 `json:"max_token_delta_pct"`
}

// BaselineComparison is the result of comparing a current overview report with
// a baseline report. It is attached to the current report under the
// "baseline_comparison" key.
type BaselineComparison struct {
// BaselinePath is the baseline report path the comparison was asked to use.
BaselinePath string `json:"baseline_path"`
// Thresholds echoes the limits the deltas were checked against.
Thresholds BaselineThresholds `json:"thresholds"`
// Current and Baseline are the snapshots extracted from each report.
Current baselineSnapshot `json:"current"`
Baseline baselineSnapshot `json:"baseline"`
// The *DeltaPct fields are the percent change of the current value relative
// to the baseline value, as computed by deltaPct.
DurationDeltaPct float64 `json:"duration_delta_pct"`
CostDeltaPct float64 `json:"cost_delta_pct"`
TokenDeltaPct float64 `json:"token_delta_pct"`
// Each flag is true when the matching delta is strictly greater than its
// threshold.
SlowerThanBaseline bool `json:"slower_than_baseline"`
CostAboveThreshold bool `json:"cost_above_threshold"`
TokensAboveThreshold bool `json:"tokens_above_threshold"`
// NewFailureFamilies lists failure families present in the current snapshot
// but not in the baseline snapshot.
NewFailureFamilies []string `json:"new_failure_families"`
// BroaderToolSurface is true when NewTools is non-empty.
BroaderToolSurface bool `json:"broader_tool_surface"`
// NewTools lists tools present in the current snapshot but not the baseline.
NewTools []string `json:"new_tools"`
// BroaderFileSurface is true when NewFiles is non-empty.
BroaderFileSurface bool `json:"broader_file_surface"`
// NewFiles lists files present in the current snapshot but not the baseline.
NewFiles []string `json:"new_files"`
// NewHighAuthorityToolUse lists high-authority tools present in the current
// snapshot but not in the baseline snapshot.
NewHighAuthorityToolUse []string `json:"new_high_authority_tool_use"`
}

// baselineSnapshot is the per-report view used by the baseline comparison. It
// is built from an overviewBaselineReport by baselineSnapshotFromReport.
type baselineSnapshot struct {
// DurationSeconds and Cost are the report totals after passing through round4.
DurationSeconds float64 `json:"duration_seconds"`
Cost float64 `json:"cost"`
// Tokens is the report's total token count, copied unchanged.
Tokens int `json:"tokens"`
// The remaining fields are the report's string lists after passing through
// sortedStringSet, which drops empty strings and duplicates.
FailureFamilies []string `json:"failure_families"`
Tools []string `json:"tools"`
Files []string `json:"files"`
HighAuthorityTools []string `json:"high_authority_tools"`
}

// overviewBaselineReport is the subset of an overview JSON report that the
// baseline comparison decodes. JSON members not listed here are ignored when
// decoding into this type.
type overviewBaselineReport struct {
// Version is required to be non-empty by decodeOverviewBaselineReport and is
// checked against the package Version for the baseline report.
Version string `json:"version"`
// Summary carries the totals that the duration, cost and token deltas are
// computed from.
Summary struct {
TotalDurationSeconds float64 `json:"total_duration_seconds"`
TotalCost float64 `json:"total_cost"`
TotalTokens int `json:"total_tokens"`
} `json:"summary"`
// FailureFamilies feeds the "new failure families" diff.
FailureFamilies []string `json:"failure_families"`
// Surfaces carries the tool, file and high-authority tool lists that are
// diffed between the current and baseline reports.
Surfaces struct {
Tools []string `json:"tools"`
Files []string `json:"files"`
HighAuthorityTools []string `json:"high_authority_tools"`
} `json:"surfaces"`
}

// AddBaselineComparison compares the JSON overview report in currentReport with
// the JSON report stored at baselinePath and returns currentReport re-encoded
// with an added top-level "baseline_comparison" member.
//
// It returns an empty string and an error when baselinePath is empty, when
// either report fails to decode or validate, when the baseline file is missing
// or unreadable, or when the baseline's version differs from Version.
func AddBaselineComparison(currentReport string, baselinePath string, thresholds BaselineThresholds) (string, error) {
	if baselinePath == "" {
		return "", errors.New("missing baseline report path")
	}

	rawCurrent := []byte(currentReport)
	current, err := decodeOverviewBaselineReport(rawCurrent, "current report")
	if err != nil {
		return "", err
	}

	rawBaseline, readErr := os.ReadFile(baselinePath)
	switch {
	case readErr == nil:
		// Baseline file was read; carry on.
	case os.IsNotExist(readErr):
		return "", fmt.Errorf("missing baseline report: %s", baselinePath)
	default:
		return "", fmt.Errorf("read baseline report %s: %w", baselinePath, readErr)
	}

	baseline, err := decodeOverviewBaselineReport(rawBaseline, "baseline report")
	if err != nil {
		return "", err
	}
	if baseline.Version != Version {
		return "", fmt.Errorf("baseline report version %q is incompatible with current version %q", baseline.Version, Version)
	}

	result := compareOverviewBaseline(current, baseline, baselinePath, thresholds)

	// Decode the current report a second time into a generic map so that every
	// member of the original report is carried over next to the comparison.
	var document map[string]interface{}
	if err := json.Unmarshal(rawCurrent, &document); err != nil {
		return "", fmt.Errorf("decode current report payload: %w", err)
	}
	document["baseline_comparison"] = result

	encoded, err := json.MarshalIndent(document, "", " ")
	if err != nil {
		return "", fmt.Errorf("encode baseline comparison report: %w", err)
	}
	return string(encoded), nil
}

// decodeOverviewBaselineReport parses data as an overview report and performs
// minimal validation. label names the report in error messages.
//
// An error is returned when data is not valid JSON for the report shape, when
// the version is empty, or when the token, cost and duration totals are all
// zero.
func decodeOverviewBaselineReport(data []byte, label string) (overviewBaselineReport, error) {
	var parsed overviewBaselineReport
	err := json.Unmarshal(data, &parsed)
	switch {
	case err != nil:
		return parsed, fmt.Errorf("decode %s JSON: %w", label, err)
	case parsed.Version == "":
		return parsed, fmt.Errorf("%s is missing version", label)
	}

	totals := parsed.Summary
	hasTotals := totals.TotalTokens != 0 || totals.TotalCost != 0 || totals.TotalDurationSeconds != 0
	if !hasTotals {
		return parsed, fmt.Errorf("%s is missing overview summary totals", label)
	}
	return parsed, nil
}

// compareOverviewBaseline builds the BaselineComparison for two decoded
// reports: percent deltas for duration, cost and tokens, threshold flags for
// each delta, and the entries that appear only in the current report.
func compareOverviewBaseline(current, baseline overviewBaselineReport, baselinePath string, thresholds BaselineThresholds) BaselineComparison {
	now := baselineSnapshotFromReport(current)
	prev := baselineSnapshotFromReport(baseline)

	result := BaselineComparison{
		BaselinePath:            baselinePath,
		Thresholds:              thresholds,
		Current:                 now,
		Baseline:                prev,
		DurationDeltaPct:        deltaPct(now.DurationSeconds, prev.DurationSeconds),
		CostDeltaPct:            deltaPct(now.Cost, prev.Cost),
		TokenDeltaPct:           deltaPct(float64(now.Tokens), float64(prev.Tokens)),
		NewFailureFamilies:      setDiff(now.FailureFamilies, prev.FailureFamilies),
		NewTools:                setDiff(now.Tools, prev.Tools),
		NewFiles:                setDiff(now.Files, prev.Files),
		NewHighAuthorityToolUse: setDiff(now.HighAuthorityTools, prev.HighAuthorityTools),
	}

	// A flag trips only when its delta is strictly above the threshold.
	result.SlowerThanBaseline = result.DurationDeltaPct > thresholds.MaxDurationDeltaPct
	result.CostAboveThreshold = result.CostDeltaPct > thresholds.MaxCostDeltaPct
	result.TokensAboveThreshold = result.TokenDeltaPct > thresholds.MaxTokenDeltaPct

	// A surface is "broader" as soon as one entry is new.
	result.BroaderToolSurface = len(result.NewTools) > 0
	result.BroaderFileSurface = len(result.NewFiles) > 0
	return result
}

// baselineSnapshotFromReport reduces a decoded report to the values the
// comparison uses: duration and cost passed through round4, the token total
// unchanged, and each string list passed through sortedStringSet.
func baselineSnapshotFromReport(report overviewBaselineReport) baselineSnapshot {
	totals, surfaces := report.Summary, report.Surfaces

	var snap baselineSnapshot
	snap.DurationSeconds = round4(totals.TotalDurationSeconds)
	snap.Cost = round4(totals.TotalCost)
	snap.Tokens = totals.TotalTokens
	snap.FailureFamilies = sortedStringSet(report.FailureFamilies)
	snap.Tools = sortedStringSet(surfaces.Tools)
	snap.Files = sortedStringSet(surfaces.Files)
	snap.HighAuthorityTools = sortedStringSet(surfaces.HighAuthorityTools)
	return snap
}

// deltaPct returns the percent change from baseline to current, passed through
// round4. When baseline is zero the ratio is undefined, so the result is 100
// if current is positive and 0 otherwise.
func deltaPct(current, baseline float64) float64 {
	switch {
	case baseline != 0:
		return round4((current - baseline) / baseline * 100)
	case current > 0:
		return 100
	default:
		return 0
	}
}

// setDiff returns the entries of current that do not occur in baseline.
// current is first passed through sortedStringSet, so empty strings and
// duplicates are dropped. The result is nil when nothing is new.
func setDiff(current, baseline []string) []string {
	known := make(map[string]struct{}, len(baseline))
	for i := range baseline {
		known[baseline[i]] = struct{}{}
	}

	var added []string
	for _, candidate := range sortedStringSet(current) {
		if _, present := known[candidate]; present {
			continue
		}
		added = append(added, candidate)
	}
	return added
}

// sortedStringSet collects the non-empty entries of items into a set and
// returns that set's keys via sortedReportKeys.
func sortedStringSet(items []string) []string {
	unique := make(map[string]struct{}, len(items))
	for i := range items {
		if items[i] == "" {
			continue
		}
		unique[items[i]] = struct{}{}
	}
	return sortedReportKeys(unique)
}
63 changes: 63 additions & 0 deletions internal/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ type Metrics struct {
ToolCallsOK int
ToolCallsFail int
ToolUsage map[string]int
FileUsage map[string]int
ReasoningBlocks int
ReasoningChars int
ReasoningLens []int
Expand Down Expand Up @@ -1882,6 +1883,7 @@ func Analyze(events []Event, model string) Metrics {
m := Metrics{
ModelUsed: model,
ToolUsage: make(map[string]int),
FileUsage: make(map[string]int),
}

pricing := LookupPrice(model)
Expand Down Expand Up @@ -1945,6 +1947,9 @@ func Analyze(events []Event, model string) Metrics {
name = "unknown"
}
m.ToolUsage[name]++
for _, file := range extractToolCallFiles(tc.Args) {
m.FileUsage[file]++
}
}

case "tool":
Expand Down Expand Up @@ -2004,6 +2009,64 @@ func Analyze(events []Event, model string) Metrics {
return m
}

// extractToolCallFiles returns the sorted, de-duplicated file paths found in a
// tool call's JSON-encoded arguments. It returns nil when args is blank or is
// not valid JSON, and an empty slice when the JSON holds no file-like values.
func extractToolCallFiles(args string) []string {
	if len(strings.TrimSpace(args)) == 0 {
		return nil
	}

	var decoded interface{}
	if json.Unmarshal([]byte(args), &decoded) != nil {
		return nil
	}

	found := map[string]struct{}{}
	collectToolCallFiles(decoded, "", found)

	// Map iteration order is random, so sort for a stable result.
	paths := make([]string, len(found))
	next := 0
	for p := range found {
		paths[next] = p
		next++
	}
	sort.Strings(paths)
	return paths
}

// collectToolCallFiles walks a decoded JSON value and records in seen every
// string whose governing key looks like a file key. key is the lower-cased
// object key the value was found under; array elements inherit the key of the
// array itself. Values of any other JSON type are ignored.
func collectToolCallFiles(raw interface{}, key string, seen map[string]struct{}) {
	if object, ok := raw.(map[string]interface{}); ok {
		for name, value := range object {
			collectToolCallFiles(value, strings.ToLower(name), seen)
		}
		return
	}
	if list, ok := raw.([]interface{}); ok {
		for i := range list {
			collectToolCallFiles(list[i], key, seen)
		}
		return
	}

	text, ok := raw.(string)
	if !ok || !isFileSurfaceKey(key) {
		return
	}
	if path := normalizeToolCallFile(text); path != "" {
		seen[path] = struct{}{}
	}
}

// isFileSurfaceKey reports whether a tool-argument key is treated as naming a
// file. Matching is case-insensitive and treats "-" as "_".
//
// A key qualifies when it is exactly "target" or "uri", or when it contains
// "file" or "path" anywhere. The latter covers path, file, files, filename,
// file_name, filepath, file_path and target_file.
func isFileSurfaceKey(key string) bool {
	normalized := strings.ReplaceAll(strings.ToLower(key), "-", "_")
	if normalized == "target" || normalized == "uri" {
		return true
	}
	return strings.Contains(normalized, "file") || strings.Contains(normalized, "path")
}

// normalizeToolCallFile turns a raw tool-argument string into a cleaned,
// slash-separated path, or returns "" when the value should not be counted as
// a file.
//
// Rejected values: blank strings, strings containing a newline, http:// and
// https:// URLs, and a file:// prefix with nothing after it. URI schemes are
// matched case-insensitively. A leading file:// is stripped before cleaning.
func normalizeToolCallFile(value string) string {
	value = strings.TrimSpace(value)
	if value == "" || strings.Contains(value, "\n") {
		return ""
	}
	if hasSchemePrefix(value, "http://") || hasSchemePrefix(value, "https://") {
		return ""
	}
	if hasSchemePrefix(value, "file://") {
		value = value[len("file://"):]
		// filepath.Clean("") yields ".", which would be recorded as a bogus
		// file, so an empty remainder is rejected here.
		if value == "" {
			return ""
		}
	}
	return filepath.ToSlash(filepath.Clean(value))
}

// hasSchemePrefix reports whether s starts with prefix, ignoring case.
func hasSchemePrefix(s, prefix string) bool {
	return len(s) >= len(prefix) && strings.EqualFold(s[:len(prefix)], prefix)
}

// ═══════════════════════════════════════════════════════════════
// ANOMALY DETECTION
// ═══════════════════════════════════════════════════════════════
Expand Down
Loading