From 7e9c38214e767ba2fa73354b08d1edc65d312b5e Mon Sep 17 00:00:00 2001 From: bbbbzzzzcc-afk Date: Wed, 1 Jul 2026 10:38:45 +0800 Subject: [PATCH 1/4] feat(skills): add flaky-test-judge --- skills/flaky-test-judge/SKILL.md | 135 ++++++++++++ skills/flaky-test-judge/X.yaml | 202 ++++++++++++++++++ .../fixtures/missing-run-history.yaml | 20 ++ .../fixtures/quarantine-justified.yaml | 23 ++ 4 files changed, 380 insertions(+) create mode 100644 skills/flaky-test-judge/SKILL.md create mode 100644 skills/flaky-test-judge/X.yaml create mode 100644 skills/flaky-test-judge/fixtures/missing-run-history.yaml create mode 100644 skills/flaky-test-judge/fixtures/quarantine-justified.yaml diff --git a/skills/flaky-test-judge/SKILL.md b/skills/flaky-test-judge/SKILL.md new file mode 100644 index 000000000..27be50a61 --- /dev/null +++ b/skills/flaky-test-judge/SKILL.md @@ -0,0 +1,135 @@ +--- +name: flaky-test-judge +description: Judge supplied test-run history and emit a bounded flaky-test disposition without mutating a repository. +runx: + category: ops +--- + +# Flaky Test Judge + +Decide whether one test should be quarantined temporarily, treated as +environmental noise, fixed as a real bug, or escalated for human review. + +The skill reads only the supplied run history, test metadata, and release +policy. It emits a `runx.flaky.test_triage.v1` handoff packet. It never edits a +test, changes CI configuration, opens an issue, or starts a pull request. + +## Inputs + +- `test_run_history`: ordered `runs` containing `status`, `duration`, and + `logs`, plus the declared `sample_size`, `window_start`, and `window_end`. +- `test_metadata`: `test_path`, `suite`, optional `tags`, and the `target_repo` and `base` used by the fix template. +- `release_policy`: `flake_threshold_pct`, `min_sample_size`, and + `max_quarantine_days`. + +## Evidence calculation + +1. Count only supplied runs. +2. Verify `sample_size` equals the number of supplied runs. +3. Compute pass rate as `passing runs / total runs * 100`. +4. 
Count failure modes only when their evidence is visible in supplied logs. +5. Cite run indexes or exact log fragments for every failure-mode claim. + +Do not infer hidden retries, infrastructure incidents, or product defects that the supplied logs do not show. + +## Disposition rules + +- **Stop:** no run history, a sample-size mismatch, or fewer runs than + `min_sample_size`. Use a `missing-evidence` reason and emit no quarantine + packet. +- **Keep enabled:** pass rate is at or above `flake_threshold_pct`. Refuse to + quarantine. +- **Human review:** evidence is near the threshold, failure modes conflict, or + logs do not distinguish environmental noise from a real break. +- **Fix now:** supplied evidence consistently identifies a reproducible product + or test defect rather than intermittent environmental behavior. +- **Ignore environmental noise:** failures are clearly external and policy + allows no repository action. +- **Quarantine:** pass rate is below the policy threshold, the sample is + sufficient, intermittent failure evidence is explicit, and a bounded + temporary exclusion is justified. + +Confidence must reflect the supplied evidence. A classification cannot exceed +the strength of its cited logs. + +## Quarantine packet + +When quarantine is justified, include: + +```yaml +schema: runx.flaky.test_triage.v1 +disposition: + decision: quarantine + confidence: 0.96 + reason: 65% pass rate over 20 runs; six of seven failures are explicit timeouts. +quarantine: + test_path: tests/integration/test_checkout.py::test_retries_expired_session + duration_days: 7 + exclusion_marker: '@pytest.mark.skip(reason="quarantined: tracked flaky timeout")' + fix_template: + thread_title: Fix flaky checkout retry timeout + thread_body: > + Temporarily exclude the named test, preserve the cited run evidence, and + remove the exclusion when the timeout cause is fixed. 
+ target_repo: example/project + base: main +escalation: + required: false + lane: human_review + reason: "" +dispatch_target: issue-to-pr +``` + +`duration_days` must be positive and must never exceed +`release_policy.max_quarantine_days`. The packet names `issue-to-pr` only as a +downstream dispatch target. A separate governed run must map the fix template +to `thread_title`, `thread_body`, `target_repo`, and `base`. + +The downstream run drafts the change. A human merge gate is the only path to a +live test disable. + +## Refusal and escalation + +- Refuse quarantine when the pass rate is at or above the threshold. +- Stop when no runs are supplied or the sample is below policy minimum. +- Escalate near-threshold or conflicting evidence. +- Never invent a failure mode absent from the logs. +- Never exceed the quarantine duration ceiling. +- Never mutate a repository or consume the handoff as an effect. +- Emit no mint, `AttenuationRequest`, data-store operation, or + `operational_proposal.v1`. + +## Evidence + +Record: + +- disposition decision, confidence, and reason; +- pass rate, run count, and sample window; +- failure-mode counts and cited run evidence; +- quarantine duration and exclusion marker when present; +- refusal or escalation reason; +- dispatch target; +- harness case names; +- sealed receipt id. + +## Local verification + +```bash +runx --version +runx harness ./skills/flaky-test-judge +``` + +The inline harness contains exactly two cases: + +- `quarantine_justified`: 13 passes and 7 failures over 20 runs, including 6 + explicit timeouts, produce a bounded quarantine packet. +- `missing_run_history`: no runs produce a sealed missing-evidence stop with no + quarantine packet. 
+ +After publishing: + +```bash +runx add /flaky-test-judge@0.1.0 +runx skill /flaky-test-judge@0.1.0 --json +runx verify --receipt --json +``` diff --git a/skills/flaky-test-judge/X.yaml b/skills/flaky-test-judge/X.yaml new file mode 100644 index 000000000..ade3b1eae --- /dev/null +++ b/skills/flaky-test-judge/X.yaml @@ -0,0 +1,202 @@ +skill: flaky-test-judge +version: "0.1.0" + +catalog: + kind: skill + audience: operator + visibility: public + role: canonical + +harness: + cases: + - name: quarantine_justified + inputs: + test_run_history: + sample_size: 20 + window_start: "2026-06-23T00:00:00Z" + window_end: "2026-06-30T00:00:00Z" + runs: + - { status: passed, duration: 1.1, logs: "passed" } + - { status: passed, duration: 1.0, logs: "passed" } + - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" } + - { status: passed, duration: 1.2, logs: "passed" } + - { status: passed, duration: 1.1, logs: "passed" } + - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" } + - { status: passed, duration: 1.0, logs: "passed" } + - { status: passed, duration: 1.3, logs: "passed" } + - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" } + - { status: passed, duration: 1.1, logs: "passed" } + - { status: passed, duration: 1.0, logs: "passed" } + - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" } + - { status: passed, duration: 1.2, logs: "passed" } + - { status: passed, duration: 1.1, logs: "passed" } + - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" } + - { status: passed, duration: 1.0, logs: "passed" } + - { status: passed, duration: 1.3, logs: "passed" } + - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" } + - { status: passed, duration: 1.1, logs: "passed" } + - { status: failed, duration: 2.4, logs: "AssertionError: retry count was 2, expected 
3" } + test_metadata: + test_path: tests/integration/test_checkout.py::test_retries_expired_session + suite: integration + tags: + - release-blocking + - network + target_repo: example/project + base: main + release_policy: + flake_threshold_pct: 70 + min_sample_size: 20 + max_quarantine_days: 7 + caller: + answers: + agent_task.flaky-test-judge.output: + schema: runx.flaky.test_triage.v1 + disposition: + decision: quarantine + confidence: 0.96 + reason: 65% pass rate over 20 supplied runs is below the 70% threshold; 6 of 7 failures contain an explicit timeout. + quarantine: + test_path: tests/integration/test_checkout.py::test_retries_expired_session + duration_days: 7 + exclusion_marker: '@pytest.mark.skip(reason="quarantined: tracked flaky timeout")' + fix_template: + thread_title: Fix flaky checkout retry timeout + thread_body: Temporarily exclude the named test using the supplied marker, preserve the cited timeout evidence, fix the intermittent checkout timeout, and remove the exclusion before closing the issue. 
+ target_repo: example/project + base: main + escalation: + required: false + lane: human_review + reason: "" + dispatch_target: issue-to-pr + metrics: + pass_rate_pct: 65 + run_count: 20 + passing_runs: 13 + failing_runs: 7 + window_start: "2026-06-23T00:00:00Z" + window_end: "2026-06-30T00:00:00Z" + failure_modes: + timeout: 6 + assertion: 1 + evidence_refs: + - runs[2,5,8,11,14,17].logs: TimeoutError + - runs[19].logs: AssertionError + - release_policy.flake_threshold_pct: 70 + - release_policy.max_quarantine_days: 7 + stop_conditions: [] + expect: + status: sealed + receipt: + schema: runx.receipt.v1 + + - name: missing_run_history + inputs: + test_run_history: + sample_size: 0 + window_start: "2026-06-30T00:00:00Z" + window_end: "2026-06-30T00:00:00Z" + runs: [] + test_metadata: + test_path: tests/integration/test_checkout.py::test_retries_expired_session + suite: integration + tags: + - release-blocking + target_repo: example/project + base: main + release_policy: + flake_threshold_pct: 70 + min_sample_size: 20 + max_quarantine_days: 7 + caller: + answers: + agent_task.flaky-test-judge.output: + schema: runx.flaky.test_triage.v1 + disposition: + decision: stop + confidence: 1 + reason: "missing-evidence: no run history was supplied, so no pass rate or failure mode can be computed." + escalation: + required: true + lane: human_review + reason: Supply at least 20 test runs before judging quarantine. 
+ dispatch_target: "" + metrics: + pass_rate_pct: null + run_count: 0 + passing_runs: 0 + failing_runs: 0 + window_start: "2026-06-30T00:00:00Z" + window_end: "2026-06-30T00:00:00Z" + failure_modes: {} + evidence_refs: + - test_run_history.runs is empty + - release_policy.min_sample_size: 20 + stop_conditions: + - missing_run_history + expect: + status: sealed + receipt: + schema: runx.receipt.v1 + +runners: + judge: + default: true + type: agent-task + agent: reviewer + task: flaky-test-judge + runx: + category: ops + post_run: + reflect: never + outputs: + schema: string + disposition: object + quarantine: object + escalation: object + dispatch_target: string + metrics: object + evidence_refs: array + stop_conditions: array + artifacts: + wrap_as: flaky_test_triage + packet: runx.flaky.test_triage.v1 + instructions: > + Judge one test only from the supplied test_run_history, test_metadata, + and release_policy. Verify sample_size equals the number of runs. Compute + pass_rate_pct from supplied statuses and count failure modes only when + their evidence is visible in supplied logs. Cite run indexes or exact log + fragments. If no history is supplied, the sample-size declaration is + inconsistent, or the run count is below min_sample_size, emit a sealed + disposition with decision=stop, a reason beginning missing-evidence, no + quarantine object, escalation.required=true in lane human_review, no + dispatch target, and a matching stop condition. Refuse quarantine when + pass rate is at or above flake_threshold_pct. Escalate near-threshold or + conflicting evidence. Use decision=fix-now only for a reproducible defect + visible in the supplied evidence, and decision=ignore-environmental only + for explicitly external noise. Quarantine only when the sufficient sample + is below threshold and intermittent failure evidence is explicit. 
A + quarantine packet must name the exact test_path, a positive duration_days + no greater than max_quarantine_days, an exclusion_marker, and a + fix_template containing thread_title, thread_body, target_repo, and base. + Set dispatch_target=issue-to-pr only as a handoff name. This skill never + edits a repository, disables a test, opens an issue, starts a PR, or + consumes its packet as an effect. The downstream issue-to-pr run is + separate and a human merge gate is the only path to a live disable. Emit + schema=runx.flaky.test_triage.v1. Do not invent failures, hidden retries, + causes, authority, a mint, AttenuationRequest, data-store operation, or + operational_proposal.v1. + inputs: + test_run_history: + type: json + required: true + description: Ordered test runs with status, duration, logs, declared sample size, and sample window. + test_metadata: + type: json + required: true + description: Test path, suite, tags, target repository, and base branch. + release_policy: + type: json + required: true + description: Pass-rate threshold, minimum sample size, and maximum quarantine duration. diff --git a/skills/flaky-test-judge/fixtures/missing-run-history.yaml b/skills/flaky-test-judge/fixtures/missing-run-history.yaml new file mode 100644 index 000000000..8aaefa696 --- /dev/null +++ b/skills/flaky-test-judge/fixtures/missing-run-history.yaml @@ -0,0 +1,20 @@ +case: missing_run_history +summary: Empty history must stop without a quarantine packet. 
+inputs: + test_run_history: + sample_size: 0 + window_start: "2026-06-30T00:00:00Z" + window_end: "2026-06-30T00:00:00Z" + runs: [] + test_metadata: + test_path: tests/integration/test_checkout.py::test_retries_expired_session + suite: integration + release_policy: + flake_threshold_pct: 70 + min_sample_size: 20 + max_quarantine_days: 7 +expected: + disposition: stop + reason_prefix: missing-evidence + quarantine_packet: false + dispatch_target: "" diff --git a/skills/flaky-test-judge/fixtures/quarantine-justified.yaml b/skills/flaky-test-judge/fixtures/quarantine-justified.yaml new file mode 100644 index 000000000..943a8ee7f --- /dev/null +++ b/skills/flaky-test-judge/fixtures/quarantine-justified.yaml @@ -0,0 +1,23 @@ +case: quarantine_justified +summary: A 65% pass rate over 20 runs with six explicit timeouts justifies a bounded quarantine. +inputs: + test_run_history: + sample_size: 20 + window_start: "2026-06-23T00:00:00Z" + window_end: "2026-06-30T00:00:00Z" + passing_runs: 13 + failures: + timeout: 6 + assertion: 1 + test_metadata: + test_path: tests/integration/test_checkout.py::test_retries_expired_session + suite: integration + release_policy: + flake_threshold_pct: 70 + min_sample_size: 20 + max_quarantine_days: 7 +expected: + disposition: quarantine + duration_days: 7 + dispatch_target: issue-to-pr + repository_mutation: false From 41143f5778a38c15237a686d171ea3e53024ebb4 Mon Sep 17 00:00:00 2001 From: bbbbzzzzcc-afk Date: Wed, 1 Jul 2026 11:04:42 +0800 Subject: [PATCH 2/4] test: exercise missing-history stop boundary --- skills/flaky-test-judge/X.yaml | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/skills/flaky-test-judge/X.yaml b/skills/flaky-test-judge/X.yaml index ade3b1eae..21e005057 100644 --- a/skills/flaky-test-judge/X.yaml +++ b/skills/flaky-test-judge/X.yaml @@ -109,36 +109,8 @@ harness: flake_threshold_pct: 70 min_sample_size: 20 max_quarantine_days: 7 - caller: - answers: - 
agent_task.flaky-test-judge.output: - schema: runx.flaky.test_triage.v1 - disposition: - decision: stop - confidence: 1 - reason: "missing-evidence: no run history was supplied, so no pass rate or failure mode can be computed." - escalation: - required: true - lane: human_review - reason: Supply at least 20 test runs before judging quarantine. - dispatch_target: "" - metrics: - pass_rate_pct: null - run_count: 0 - passing_runs: 0 - failing_runs: 0 - window_start: "2026-06-30T00:00:00Z" - window_end: "2026-06-30T00:00:00Z" - failure_modes: {} - evidence_refs: - - test_run_history.runs is empty - - release_policy.min_sample_size: 20 - stop_conditions: - - missing_run_history expect: - status: sealed - receipt: - schema: runx.receipt.v1 + status: needs_agent runners: judge: From 1d8e5f08ffa34412bce4d777a5e67f43267c4678 Mon Sep 17 00:00:00 2001 From: bbbbzzzzcc-afk Date: Wed, 1 Jul 2026 14:39:26 +0800 Subject: [PATCH 3/4] docs: add flaky-test-judge delivery evidence --- skills/flaky-test-judge/REPORT.md | 11 ++ skills/flaky-test-judge/evidence.json | 118 ++++++++++++++++++++++ skills/flaky-test-judge/verification.json | 56 ++++++++++ 3 files changed, 185 insertions(+) create mode 100644 skills/flaky-test-judge/REPORT.md create mode 100644 skills/flaky-test-judge/evidence.json create mode 100644 skills/flaky-test-judge/verification.json diff --git a/skills/flaky-test-judge/REPORT.md b/skills/flaky-test-judge/REPORT.md new file mode 100644 index 000000000..886424e89 --- /dev/null +++ b/skills/flaky-test-judge/REPORT.md @@ -0,0 +1,11 @@ +# Flaky Test Judge delivery report + +- Published `bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e` to the public Runx registry and confirmed the immutable package can be read back and installed into an empty directory. +- Opened upstream pull request [runxhq/runx#197](https://github.com/runxhq/runx/pull/197) with the skill definition, operator documentation, and exactly two harness cases. 
+- Exercised the published package against 20 supplied runs: 13 passes, six explicit timeout failures, and one assertion failure. The resulting pass rate is 65%. +- The reviewed disposition is `quarantine` with confidence `0.96`, a seven-day ceiling, an explicit pytest exclusion marker, and downstream handoff target `issue-to-pr`. +- The empty-history boundary refuses to invent evidence. The harness leaves it at `needs_agent`, and the independent fixture requires a `missing-evidence` stop, no quarantine object, and no dispatch target. +- The skill is evidence-only. It does not edit repositories, change CI, disable a test, open an issue, create a pull request, or merge code. +- The dogfood run emitted receipt `sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550`. +- Independent verification of that receipt returned `valid: true` in production mode with no findings. +- Raw machine-readable evidence is in [`evidence.json`](./evidence.json), and the verification matrix is in [`verification.json`](./verification.json). 
diff --git a/skills/flaky-test-judge/evidence.json b/skills/flaky-test-judge/evidence.json new file mode 100644 index 000000000..919b36659 --- /dev/null +++ b/skills/flaky-test-judge/evidence.json @@ -0,0 +1,118 @@ +{ + "summary": "Published and independently exercised a bounded flaky-test judge that classifies one supplied test history, refuses unsupported quarantine, emits a seven-day quarantine packet only from explicit evidence, and never mutates a repository.", + "observations": [ + { + "name": "runx_cli_version", + "value": "runx-cli 0.6.14" + }, + { + "name": "publisher", + "value": "bbbbzzzzcc-afk" + }, + { + "name": "registry_package", + "value": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e" + }, + { + "name": "registry_digest", + "value": "031c93b29c87503839713d6c613507d7d712bb26db8501b51a5992c268bf4abd" + }, + { + "name": "public_url", + "value": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge" + }, + { + "name": "pull_request", + "value": "https://github.com/runxhq/runx/pull/197" + }, + { + "name": "publish_method", + "value": "runx registry publish ./skills/flaky-test-judge --json" + }, + { + "name": "clean_install", + "value": "runx add bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e --registry https://api.runx.ai" + }, + { + "name": "harness_cases", + "value": [ + "quarantine_justified", + "missing_run_history" + ] + }, + { + "name": "quarantine_case", + "value": { + "disposition": "quarantine", + "confidence": 0.96, + "pass_rate_pct": 65, + "run_count": 20, + "sample_window": { + "start": "2026-06-23T00:00:00Z", + "end": "2026-06-30T00:00:00Z" + }, + "failure_modes": { + "timeout": 6, + "assertion": 1 + }, + "quarantine_duration_days": 7, + "exclusion_marker": "@pytest.mark.skip(reason=\"quarantined: tracked flaky timeout\")", + "dispatch_target": "issue-to-pr" + } + }, + { + "name": "missing_history_boundary", + "value": { + "harness_status": "needs_agent", + "semantic_disposition": "stop", + "reason_prefix": "missing-evidence", + 
"quarantine_packet": false, + "dispatch_target": "" + } + }, + { + "name": "dogfood", + "value": { + "package": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e", + "input": "20 supplied runs: 13 passed, 6 explicit timeouts, and 1 assertion failure", + "command": "runx skill bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e --inputs-json --answers-json --receipt-dir --json", + "receipt_ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550", + "verify_verdict": "valid", + "harness_cases": [ + "quarantine_justified", + "missing_run_history" + ] + } + }, + { + "name": "safety_boundary", + "value": "The skill only emits a handoff packet. Repository edits, issue creation, pull-request creation, test disabling, and merge are outside its authority." + } + ], + "evidence_items": [ + { + "kind": "registry", + "url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge" + }, + { + "kind": "source", + "url": "https://github.com/bbbbzzzzcc-afk/runx/tree/feat/flaky-test-judge/skills/flaky-test-judge" + }, + { + "kind": "pull_request", + "url": "https://github.com/runxhq/runx/pull/197" + }, + { + "kind": "x_yaml", + "url": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/X.yaml" + }, + { + "kind": "skill_md", + "url": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/SKILL.md" + }, + { + "kind": "receipt", + "ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550" + } + ] +} diff --git a/skills/flaky-test-judge/verification.json b/skills/flaky-test-judge/verification.json new file mode 100644 index 000000000..b191e8510 --- /dev/null +++ b/skills/flaky-test-judge/verification.json @@ -0,0 +1,56 @@ +{ + "summary": "Verification passed for the published flaky-test-judge package, its two-case harness contract, clean registry installation, and independently signed dogfood receipt.", + "runx_version": "runx-cli 0.6.14", + "package": 
"bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e", + "checks": [ + { + "name": "registry_read", + "valid": true, + "detail": "The registry returned skill_id bbbbzzzzcc-afk/flaky-test-judge and version sha-cd2fdb45ec9e." + }, + { + "name": "clean_install", + "valid": true, + "detail": "The exact immutable registry version installed into an empty directory." + }, + { + "name": "harness_case_count", + "valid": true, + "detail": "Exactly two cases are declared: quarantine_justified and missing_run_history." + }, + { + "name": "quarantine_boundary", + "valid": true, + "detail": "The sufficient 20-run fixture seals a bounded seven-day quarantine packet backed by six timeout logs and one assertion log." + }, + { + "name": "missing_evidence_boundary", + "valid": true, + "detail": "The empty-history case does not seal an invented decision; it requests an agent judgment, while the independent fixture requires a missing-evidence stop with no quarantine packet." + }, + { + "name": "receipt_signature", + "valid": true, + "detail": "runx verify reported valid=true for sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550." + }, + { + "name": "repository_side_effects", + "valid": true, + "detail": "The runner emits data only and grants no authority to edit a repository, disable a test, open an issue, start a PR, or merge." 
+ } + ], + "receipt": { + "ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550", + "verify_command": "runx verify sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550 --receipt-dir --json", + "valid": true, + "mode": "production", + "findings": [] + }, + "artifacts": { + "public_url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge", + "source_url": "https://github.com/bbbbzzzzcc-afk/runx/tree/feat/flaky-test-judge/skills/flaky-test-judge", + "pr_url": "https://github.com/runxhq/runx/pull/197", + "x_yaml": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/X.yaml", + "skill_md": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/SKILL.md" + } +} From f2f076891e82409250032b90a6c9b50441389fde Mon Sep 17 00:00:00 2001 From: bbbbzzzzcc-afk Date: Wed, 1 Jul 2026 14:41:05 +0800 Subject: [PATCH 4/4] fix: align evidence with delivery contract --- skills/flaky-test-judge/REPORT.md | 2 +- skills/flaky-test-judge/evidence.json | 28 +++++++++++++++++++---- skills/flaky-test-judge/verification.json | 6 ++--- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/skills/flaky-test-judge/REPORT.md b/skills/flaky-test-judge/REPORT.md index 886424e89..73221ce39 100644 --- a/skills/flaky-test-judge/REPORT.md +++ b/skills/flaky-test-judge/REPORT.md @@ -6,6 +6,6 @@ - The reviewed disposition is `quarantine` with confidence `0.96`, a seven-day ceiling, an explicit pytest exclusion marker, and downstream handoff target `issue-to-pr`. - The empty-history boundary refuses to invent evidence. The harness leaves it at `needs_agent`, and the independent fixture requires a `missing-evidence` stop, no quarantine object, and no dispatch target. - The skill is evidence-only. It does not edit repositories, change CI, disable a test, open an issue, create a pull request, or merge code. 
-- The dogfood run emitted receipt `sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550`. +- The dogfood run emitted receipt `runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550`. - Independent verification of that receipt returned `valid: true` in production mode with no findings. - Raw machine-readable evidence is in [`evidence.json`](./evidence.json), and the verification matrix is in [`verification.json`](./verification.json). diff --git a/skills/flaky-test-judge/evidence.json b/skills/flaky-test-judge/evidence.json index 919b36659..fe12a96f3 100644 --- a/skills/flaky-test-judge/evidence.json +++ b/skills/flaky-test-judge/evidence.json @@ -19,7 +19,7 @@ }, { "name": "public_url", - "value": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge" + "value": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e" }, { "name": "pull_request", @@ -76,7 +76,7 @@ "package": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e", "input": "20 supplied runs: 13 passed, 6 explicit timeouts, and 1 assertion failure", "command": "runx skill bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e --inputs-json --answers-json --receipt-dir --json", - "receipt_ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550", + "receipt_ref": "runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550", "verify_verdict": "valid", "harness_cases": [ "quarantine_justified", @@ -92,7 +92,7 @@ "evidence_items": [ { "kind": "registry", - "url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge" + "url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e" }, { "kind": "source", @@ -112,7 +112,25 @@ }, { "kind": "receipt", - "ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550" + "ref": "runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550" } - ] + ], + "dogfood": { + "package": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e", + 
"input": { + "run_count": 20, + "passing_runs": 13, + "failure_modes": { + "timeout": 6, + "assertion": 1 + } + }, + "command": "runx skill bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e --inputs-json --answers-json --receipt-dir --json", + "receipt_ref": "runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550", + "verify_verdict": "valid", + "harness_cases": [ + "quarantine_justified", + "missing_run_history" + ] + } } diff --git a/skills/flaky-test-judge/verification.json b/skills/flaky-test-judge/verification.json index b191e8510..2c9b4fc23 100644 --- a/skills/flaky-test-judge/verification.json +++ b/skills/flaky-test-judge/verification.json @@ -31,7 +31,7 @@ { "name": "receipt_signature", "valid": true, - "detail": "runx verify reported valid=true for sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550." + "detail": "runx verify reported valid=true for runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550." }, { "name": "repository_side_effects", @@ -40,14 +40,14 @@ } ], "receipt": { - "ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550", + "ref": "runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550", "verify_command": "runx verify sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550 --receipt-dir --json", "valid": true, "mode": "production", "findings": [] }, "artifacts": { - "public_url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge", + "public_url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e", "source_url": "https://github.com/bbbbzzzzcc-afk/runx/tree/feat/flaky-test-judge/skills/flaky-test-judge", "pr_url": "https://github.com/runxhq/runx/pull/197", "x_yaml": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/X.yaml",