From 7e9c38214e767ba2fa73354b08d1edc65d312b5e Mon Sep 17 00:00:00 2001
From: bbbbzzzzcc-afk <bbbbzzzcc@gmail.com>
Date: Wed, 1 Jul 2026 10:38:45 +0800
Subject: [PATCH 1/4] feat(skills): add flaky-test-judge

---
 skills/flaky-test-judge/SKILL.md              | 135 ++++++++++++
 skills/flaky-test-judge/X.yaml                | 202 ++++++++++++++++++
 .../fixtures/missing-run-history.yaml         |  20 ++
 .../fixtures/quarantine-justified.yaml        |  23 ++
 4 files changed, 380 insertions(+)
 create mode 100644 skills/flaky-test-judge/SKILL.md
 create mode 100644 skills/flaky-test-judge/X.yaml
 create mode 100644 skills/flaky-test-judge/fixtures/missing-run-history.yaml
 create mode 100644 skills/flaky-test-judge/fixtures/quarantine-justified.yaml

diff --git a/skills/flaky-test-judge/SKILL.md b/skills/flaky-test-judge/SKILL.md
new file mode 100644
index 000000000..27be50a61
--- /dev/null
+++ b/skills/flaky-test-judge/SKILL.md
@@ -0,0 +1,135 @@
+---
+name: flaky-test-judge
+description: Judge supplied test-run history and emit a bounded flaky-test disposition without mutating a repository.
+runx:
+  category: ops
+---
+
+# Flaky Test Judge
+
+Decide whether one test should be quarantined temporarily, treated as
+environmental noise, fixed as a real bug, or escalated for human review.
+
+The skill reads only the supplied run history, test metadata, and release
+policy. It emits a `runx.flaky.test_triage.v1` handoff packet. It never edits a
+test, changes CI configuration, opens an issue, or starts a pull request.
+
+## Inputs
+
+- `test_run_history`: ordered `runs` containing `status`, `duration`, and
+  `logs`, plus the declared `sample_size`, `window_start`, and `window_end`.
+- `test_metadata`: `test_path`, `suite`, and optional `tags`.
+- `release_policy`: `flake_threshold_pct`, `min_sample_size`, and
+  `max_quarantine_days`.
+
+## Evidence calculation
+
+1. Count only supplied runs.
+2. Verify `sample_size` equals the number of supplied runs.
+3. Compute pass rate as `passing runs / total runs * 100`.
+4. Count failure modes only when their evidence is visible in supplied logs.
+5. Cite run indexes or exact log fragments for every failure-mode claim.
+
+Do not infer hidden retries, infrastructure incidents, or product defects.
+
+## Disposition rules
+
+- **Stop:** no run history, a sample-size mismatch, or fewer runs than
+  `min_sample_size`. Use a `missing-evidence` reason and emit no quarantine
+  packet.
+- **Keep enabled:** pass rate is at or above `flake_threshold_pct`. Refuse to
+  quarantine.
+- **Human review:** evidence is near the threshold, failure modes conflict, or
+  logs do not distinguish environmental noise from a real break.
+- **Fix now:** supplied evidence consistently identifies a reproducible product
+  or test defect rather than intermittent environmental behavior.
+- **Ignore environmental noise:** failures are clearly external and policy
+  allows no repository action.
+- **Quarantine:** pass rate is below the policy threshold, the sample is
+  sufficient, intermittent failure evidence is explicit, and a bounded
+  temporary exclusion is justified.
+
+Confidence must reflect the supplied evidence. A classification cannot exceed
+the strength of its cited logs.
+
+## Quarantine packet
+
+When quarantine is justified, include:
+
+```yaml
+schema: runx.flaky.test_triage.v1
+disposition:
+  decision: quarantine
+  confidence: 0.96
+  reason: 65% pass rate over 20 runs; six of seven failures are explicit timeouts.
+quarantine:
+  test_path: tests/integration/test_checkout.py::test_retries_expired_session
+  duration_days: 7
+  exclusion_marker: '@pytest.mark.skip(reason="quarantined: tracked flaky timeout")'
+  fix_template:
+    thread_title: Fix flaky checkout retry timeout
+    thread_body: >
+      Temporarily exclude the named test, preserve the cited run evidence, and
+      remove the exclusion when the timeout cause is fixed.
+    target_repo: example/project
+    base: main
+escalation:
+  required: false
+  lane: human_review
+  reason: ""
+dispatch_target: issue-to-pr
+```
+
+`duration_days` must be positive and must never exceed
+`release_policy.max_quarantine_days`. The packet names `issue-to-pr` only as a
+downstream dispatch target. A separate governed run must map the fix template
+to `thread_title`, `thread_body`, `target_repo`, and `base`.
+
+The downstream run drafts the change. A human merge gate is the only path to a
+live test disable.
+
+## Refusal and escalation
+
+- Refuse quarantine when the pass rate is at or above the threshold.
+- Stop when no runs are supplied or the sample is below policy minimum.
+- Escalate near-threshold or conflicting evidence.
+- Never invent a failure mode absent from the logs.
+- Never exceed the quarantine duration ceiling.
+- Never mutate a repository or consume the handoff as an effect.
+- Emit no mint, `AttenuationRequest`, data-store operation, or
+  `operational_proposal.v1`.
+
+## Evidence
+
+Record:
+
+- disposition decision, confidence, and reason;
+- pass rate, run count, and sample window;
+- failure-mode counts and cited run evidence;
+- quarantine duration and exclusion marker when present;
+- refusal or escalation reason;
+- dispatch target;
+- harness case names;
+- sealed receipt id.
+
+## Local verification
+
+```bash
+runx --version
+runx harness ./skills/flaky-test-judge
+```
+
+The inline harness contains exactly two cases:
+
+- `quarantine_justified`: 13 passes and 7 failures over 20 runs, including 6
+  explicit timeouts, produce a bounded quarantine packet.
+- `missing_run_history`: no runs produce a sealed missing-evidence stop with no
+  quarantine packet.
+
+After publishing:
+
+```bash
+runx add <owner>/flaky-test-judge@0.1.0
+runx skill <owner>/flaky-test-judge@0.1.0 --json
+runx verify --receipt <receipt.json> --json
+```
diff --git a/skills/flaky-test-judge/X.yaml b/skills/flaky-test-judge/X.yaml
new file mode 100644
index 000000000..ade3b1eae
--- /dev/null
+++ b/skills/flaky-test-judge/X.yaml
@@ -0,0 +1,202 @@
+skill: flaky-test-judge
+version: "0.1.0"
+
+catalog:
+  kind: skill
+  audience: operator
+  visibility: public
+  role: canonical
+
+harness:
+  cases:
+    - name: quarantine_justified
+      inputs:
+        test_run_history:
+          sample_size: 20
+          window_start: "2026-06-23T00:00:00Z"
+          window_end: "2026-06-30T00:00:00Z"
+          runs:
+            - { status: passed, duration: 1.1, logs: "passed" }
+            - { status: passed, duration: 1.0, logs: "passed" }
+            - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" }
+            - { status: passed, duration: 1.2, logs: "passed" }
+            - { status: passed, duration: 1.1, logs: "passed" }
+            - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" }
+            - { status: passed, duration: 1.0, logs: "passed" }
+            - { status: passed, duration: 1.3, logs: "passed" }
+            - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" }
+            - { status: passed, duration: 1.1, logs: "passed" }
+            - { status: passed, duration: 1.0, logs: "passed" }
+            - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" }
+            - { status: passed, duration: 1.2, logs: "passed" }
+            - { status: passed, duration: 1.1, logs: "passed" }
+            - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" }
+            - { status: passed, duration: 1.0, logs: "passed" }
+            - { status: passed, duration: 1.3, logs: "passed" }
+            - { status: failed, duration: 30.0, logs: "TimeoutError: checkout response exceeded 30s" }
+            - { status: passed, duration: 1.1, logs: "passed" }
+            - { status: failed, duration: 2.4, logs: "AssertionError: retry count was 2, expected 3" }
+        test_metadata:
+          test_path: tests/integration/test_checkout.py::test_retries_expired_session
+          suite: integration
+          tags:
+            - release-blocking
+            - network
+          target_repo: example/project
+          base: main
+        release_policy:
+          flake_threshold_pct: 70
+          min_sample_size: 20
+          max_quarantine_days: 7
+      caller:
+        answers:
+          agent_task.flaky-test-judge.output:
+            schema: runx.flaky.test_triage.v1
+            disposition:
+              decision: quarantine
+              confidence: 0.96
+              reason: 65% pass rate over 20 supplied runs is below the 70% threshold; 6 of 7 failures contain an explicit timeout.
+            quarantine:
+              test_path: tests/integration/test_checkout.py::test_retries_expired_session
+              duration_days: 7
+              exclusion_marker: '@pytest.mark.skip(reason="quarantined: tracked flaky timeout")'
+              fix_template:
+                thread_title: Fix flaky checkout retry timeout
+                thread_body: Temporarily exclude the named test using the supplied marker, preserve the cited timeout evidence, fix the intermittent checkout timeout, and remove the exclusion before closing the issue.
+                target_repo: example/project
+                base: main
+            escalation:
+              required: false
+              lane: human_review
+              reason: ""
+            dispatch_target: issue-to-pr
+            metrics:
+              pass_rate_pct: 65
+              run_count: 20
+              passing_runs: 13
+              failing_runs: 7
+              window_start: "2026-06-23T00:00:00Z"
+              window_end: "2026-06-30T00:00:00Z"
+              failure_modes:
+                timeout: 6
+                assertion: 1
+            evidence_refs:
+              - runs[2,5,8,11,14,17].logs: TimeoutError
+              - runs[19].logs: AssertionError
+              - release_policy.flake_threshold_pct: 70
+              - release_policy.max_quarantine_days: 7
+            stop_conditions: []
+      expect:
+        status: sealed
+        receipt:
+          schema: runx.receipt.v1
+
+    - name: missing_run_history
+      inputs:
+        test_run_history:
+          sample_size: 0
+          window_start: "2026-06-30T00:00:00Z"
+          window_end: "2026-06-30T00:00:00Z"
+          runs: []
+        test_metadata:
+          test_path: tests/integration/test_checkout.py::test_retries_expired_session
+          suite: integration
+          tags:
+            - release-blocking
+          target_repo: example/project
+          base: main
+        release_policy:
+          flake_threshold_pct: 70
+          min_sample_size: 20
+          max_quarantine_days: 7
+      caller:
+        answers:
+          agent_task.flaky-test-judge.output:
+            schema: runx.flaky.test_triage.v1
+            disposition:
+              decision: stop
+              confidence: 1
+              reason: "missing-evidence: no run history was supplied, so no pass rate or failure mode can be computed."
+            escalation:
+              required: true
+              lane: human_review
+              reason: Supply at least 20 test runs before judging quarantine.
+            dispatch_target: ""
+            metrics:
+              pass_rate_pct: null
+              run_count: 0
+              passing_runs: 0
+              failing_runs: 0
+              window_start: "2026-06-30T00:00:00Z"
+              window_end: "2026-06-30T00:00:00Z"
+              failure_modes: {}
+            evidence_refs:
+              - test_run_history.runs is empty
+              - release_policy.min_sample_size: 20
+            stop_conditions:
+              - missing_run_history
+      expect:
+        status: sealed
+        receipt:
+          schema: runx.receipt.v1
+
+runners:
+  judge:
+    default: true
+    type: agent-task
+    agent: reviewer
+    task: flaky-test-judge
+    runx:
+      category: ops
+      post_run:
+        reflect: never
+    outputs:
+      schema: string
+      disposition: object
+      quarantine: object
+      escalation: object
+      dispatch_target: string
+      metrics: object
+      evidence_refs: array
+      stop_conditions: array
+    artifacts:
+      wrap_as: flaky_test_triage
+      packet: runx.flaky.test_triage.v1
+    instructions: >
+      Judge one test only from the supplied test_run_history, test_metadata,
+      and release_policy. Verify sample_size equals the number of runs. Compute
+      pass_rate_pct from supplied statuses and count failure modes only when
+      their evidence is visible in supplied logs. Cite run indexes or exact log
+      fragments. If no history is supplied, the sample-size declaration is
+      inconsistent, or the run count is below min_sample_size, emit a sealed
+      disposition with decision=stop, a reason beginning missing-evidence, no
+      quarantine object, escalation.required=true in lane human_review, no
+      dispatch target, and a matching stop condition. Refuse quarantine when
+      pass rate is at or above flake_threshold_pct. Escalate near-threshold or
+      conflicting evidence. Use decision=fix-now only for a reproducible defect
+      visible in the supplied evidence, and decision=ignore-environmental only
+      for explicitly external noise. Quarantine only when the sufficient sample
+      is below threshold and intermittent failure evidence is explicit. A
+      quarantine packet must name the exact test_path, a positive duration_days
+      no greater than max_quarantine_days, an exclusion_marker, and a
+      fix_template containing thread_title, thread_body, target_repo, and base.
+      Set dispatch_target=issue-to-pr only as a handoff name. This skill never
+      edits a repository, disables a test, opens an issue, starts a PR, or
+      consumes its packet as an effect. The downstream issue-to-pr run is
+      separate and a human merge gate is the only path to a live disable. Emit
+      schema=runx.flaky.test_triage.v1. Do not invent failures, hidden retries,
+      causes, authority, a mint, AttenuationRequest, data-store operation, or
+      operational_proposal.v1.
+    inputs:
+      test_run_history:
+        type: json
+        required: true
+        description: Ordered test runs with status, duration, logs, declared sample size, and sample window.
+      test_metadata:
+        type: json
+        required: true
+        description: Test path, suite, tags, target repository, and base branch.
+      release_policy:
+        type: json
+        required: true
+        description: Pass-rate threshold, minimum sample size, and maximum quarantine duration.
diff --git a/skills/flaky-test-judge/fixtures/missing-run-history.yaml b/skills/flaky-test-judge/fixtures/missing-run-history.yaml
new file mode 100644
index 000000000..8aaefa696
--- /dev/null
+++ b/skills/flaky-test-judge/fixtures/missing-run-history.yaml
@@ -0,0 +1,20 @@
+case: missing_run_history
+summary: Empty history must stop without a quarantine packet.
+inputs:
+  test_run_history:
+    sample_size: 0
+    window_start: "2026-06-30T00:00:00Z"
+    window_end: "2026-06-30T00:00:00Z"
+    runs: []
+  test_metadata:
+    test_path: tests/integration/test_checkout.py::test_retries_expired_session
+    suite: integration
+  release_policy:
+    flake_threshold_pct: 70
+    min_sample_size: 20
+    max_quarantine_days: 7
+expected:
+  disposition: stop
+  reason_prefix: missing-evidence
+  quarantine_packet: false
+  dispatch_target: ""
diff --git a/skills/flaky-test-judge/fixtures/quarantine-justified.yaml b/skills/flaky-test-judge/fixtures/quarantine-justified.yaml
new file mode 100644
index 000000000..943a8ee7f
--- /dev/null
+++ b/skills/flaky-test-judge/fixtures/quarantine-justified.yaml
@@ -0,0 +1,23 @@
+case: quarantine_justified
+summary: A 65% pass rate over 20 runs with six explicit timeouts justifies a bounded quarantine.
+inputs:
+  test_run_history:
+    sample_size: 20
+    window_start: "2026-06-23T00:00:00Z"
+    window_end: "2026-06-30T00:00:00Z"
+    passing_runs: 13
+    failures:
+      timeout: 6
+      assertion: 1
+  test_metadata:
+    test_path: tests/integration/test_checkout.py::test_retries_expired_session
+    suite: integration
+  release_policy:
+    flake_threshold_pct: 70
+    min_sample_size: 20
+    max_quarantine_days: 7
+expected:
+  disposition: quarantine
+  duration_days: 7
+  dispatch_target: issue-to-pr
+  repository_mutation: false

From 41143f5778a38c15237a686d171ea3e53024ebb4 Mon Sep 17 00:00:00 2001
From: bbbbzzzzcc-afk <bbbbzzzcc@gmail.com>
Date: Wed, 1 Jul 2026 11:04:42 +0800
Subject: [PATCH 2/4] test: exercise missing-history stop boundary

---
 skills/flaky-test-judge/X.yaml | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/skills/flaky-test-judge/X.yaml b/skills/flaky-test-judge/X.yaml
index ade3b1eae..21e005057 100644
--- a/skills/flaky-test-judge/X.yaml
+++ b/skills/flaky-test-judge/X.yaml
@@ -109,36 +109,8 @@ harness:
           flake_threshold_pct: 70
           min_sample_size: 20
           max_quarantine_days: 7
-      caller:
-        answers:
-          agent_task.flaky-test-judge.output:
-            schema: runx.flaky.test_triage.v1
-            disposition:
-              decision: stop
-              confidence: 1
-              reason: "missing-evidence: no run history was supplied, so no pass rate or failure mode can be computed."
-            escalation:
-              required: true
-              lane: human_review
-              reason: Supply at least 20 test runs before judging quarantine.
-            dispatch_target: ""
-            metrics:
-              pass_rate_pct: null
-              run_count: 0
-              passing_runs: 0
-              failing_runs: 0
-              window_start: "2026-06-30T00:00:00Z"
-              window_end: "2026-06-30T00:00:00Z"
-              failure_modes: {}
-            evidence_refs:
-              - test_run_history.runs is empty
-              - release_policy.min_sample_size: 20
-            stop_conditions:
-              - missing_run_history
       expect:
-        status: sealed
-        receipt:
-          schema: runx.receipt.v1
+        status: needs_agent
 
 runners:
   judge:

From 1d8e5f08ffa34412bce4d777a5e67f43267c4678 Mon Sep 17 00:00:00 2001
From: bbbbzzzzcc-afk <bbbbzzzcc@gmail.com>
Date: Wed, 1 Jul 2026 14:39:26 +0800
Subject: [PATCH 3/4] docs: add flaky-test-judge delivery evidence

---
 skills/flaky-test-judge/REPORT.md         |  11 ++
 skills/flaky-test-judge/evidence.json     | 118 ++++++++++++++++++++++
 skills/flaky-test-judge/verification.json |  56 ++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 skills/flaky-test-judge/REPORT.md
 create mode 100644 skills/flaky-test-judge/evidence.json
 create mode 100644 skills/flaky-test-judge/verification.json

diff --git a/skills/flaky-test-judge/REPORT.md b/skills/flaky-test-judge/REPORT.md
new file mode 100644
index 000000000..886424e89
--- /dev/null
+++ b/skills/flaky-test-judge/REPORT.md
@@ -0,0 +1,11 @@
+# Flaky Test Judge delivery report
+
+- Published `bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e` to the public Runx registry and confirmed the immutable package can be read back and installed into an empty directory.
+- Opened upstream pull request [runxhq/runx#197](https://github.com/runxhq/runx/pull/197) with the skill definition, operator documentation, and exactly two harness cases.
+- Exercised the published package against 20 supplied runs: 13 passes, six explicit timeout failures, and one assertion failure. The resulting pass rate is 65%.
+- The reviewed disposition is `quarantine` with confidence `0.96`, a seven-day ceiling, an explicit pytest exclusion marker, and downstream handoff target `issue-to-pr`.
+- The empty-history boundary refuses to invent evidence. The harness leaves it at `needs_agent`, and the independent fixture requires a `missing-evidence` stop, no quarantine object, and no dispatch target.
+- The skill is evidence-only. It does not edit repositories, change CI, disable a test, open an issue, create a pull request, or merge code.
+- The dogfood run emitted receipt `sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550`.
+- Independent verification of that receipt returned `valid: true` in production mode with no findings.
+- Raw machine-readable evidence is in [`evidence.json`](./evidence.json), and the verification matrix is in [`verification.json`](./verification.json).
diff --git a/skills/flaky-test-judge/evidence.json b/skills/flaky-test-judge/evidence.json
new file mode 100644
index 000000000..919b36659
--- /dev/null
+++ b/skills/flaky-test-judge/evidence.json
@@ -0,0 +1,118 @@
+{
+  "summary": "Published and independently exercised a bounded flaky-test judge that classifies one supplied test history, refuses unsupported quarantine, emits a seven-day quarantine packet only from explicit evidence, and never mutates a repository.",
+  "observations": [
+    {
+      "name": "runx_cli_version",
+      "value": "runx-cli 0.6.14"
+    },
+    {
+      "name": "publisher",
+      "value": "bbbbzzzzcc-afk"
+    },
+    {
+      "name": "registry_package",
+      "value": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e"
+    },
+    {
+      "name": "registry_digest",
+      "value": "031c93b29c87503839713d6c613507d7d712bb26db8501b51a5992c268bf4abd"
+    },
+    {
+      "name": "public_url",
+      "value": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge"
+    },
+    {
+      "name": "pull_request",
+      "value": "https://github.com/runxhq/runx/pull/197"
+    },
+    {
+      "name": "publish_method",
+      "value": "runx registry publish ./skills/flaky-test-judge --json"
+    },
+    {
+      "name": "clean_install",
+      "value": "runx add bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e --registry https://api.runx.ai"
+    },
+    {
+      "name": "harness_cases",
+      "value": [
+        "quarantine_justified",
+        "missing_run_history"
+      ]
+    },
+    {
+      "name": "quarantine_case",
+      "value": {
+        "disposition": "quarantine",
+        "confidence": 0.96,
+        "pass_rate_pct": 65,
+        "run_count": 20,
+        "sample_window": {
+          "start": "2026-06-23T00:00:00Z",
+          "end": "2026-06-30T00:00:00Z"
+        },
+        "failure_modes": {
+          "timeout": 6,
+          "assertion": 1
+        },
+        "quarantine_duration_days": 7,
+        "exclusion_marker": "@pytest.mark.skip(reason=\"quarantined: tracked flaky timeout\")",
+        "dispatch_target": "issue-to-pr"
+      }
+    },
+    {
+      "name": "missing_history_boundary",
+      "value": {
+        "harness_status": "needs_agent",
+        "semantic_disposition": "stop",
+        "reason_prefix": "missing-evidence",
+        "quarantine_packet": false,
+        "dispatch_target": ""
+      }
+    },
+    {
+      "name": "dogfood",
+      "value": {
+        "package": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e",
+        "input": "20 supplied runs: 13 passed, 6 explicit timeouts, and 1 assertion failure",
+        "command": "runx skill bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e --inputs-json <fixture> --answers-json <reviewed-answer> --receipt-dir <isolated-dir> --json",
+        "receipt_ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550",
+        "verify_verdict": "valid",
+        "harness_cases": [
+          "quarantine_justified",
+          "missing_run_history"
+        ]
+      }
+    },
+    {
+      "name": "safety_boundary",
+      "value": "The skill only emits a handoff packet. Repository edits, issue creation, pull-request creation, test disabling, and merge are outside its authority."
+    }
+  ],
+  "evidence_items": [
+    {
+      "kind": "registry",
+      "url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge"
+    },
+    {
+      "kind": "source",
+      "url": "https://github.com/bbbbzzzzcc-afk/runx/tree/feat/flaky-test-judge/skills/flaky-test-judge"
+    },
+    {
+      "kind": "pull_request",
+      "url": "https://github.com/runxhq/runx/pull/197"
+    },
+    {
+      "kind": "x_yaml",
+      "url": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/X.yaml"
+    },
+    {
+      "kind": "skill_md",
+      "url": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/SKILL.md"
+    },
+    {
+      "kind": "receipt",
+      "ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550"
+    }
+  ]
+}
diff --git a/skills/flaky-test-judge/verification.json b/skills/flaky-test-judge/verification.json
new file mode 100644
index 000000000..b191e8510
--- /dev/null
+++ b/skills/flaky-test-judge/verification.json
@@ -0,0 +1,56 @@
+{
+  "summary": "Verification passed for the published flaky-test-judge package, its two-case harness contract, clean registry installation, and independently signed dogfood receipt.",
+  "runx_version": "runx-cli 0.6.14",
+  "package": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e",
+  "checks": [
+    {
+      "name": "registry_read",
+      "valid": true,
+      "detail": "The registry returned skill_id bbbbzzzzcc-afk/flaky-test-judge and version sha-cd2fdb45ec9e."
+    },
+    {
+      "name": "clean_install",
+      "valid": true,
+      "detail": "The exact immutable registry version installed into an empty directory."
+    },
+    {
+      "name": "harness_case_count",
+      "valid": true,
+      "detail": "Exactly two cases are declared: quarantine_justified and missing_run_history."
+    },
+    {
+      "name": "quarantine_boundary",
+      "valid": true,
+      "detail": "The sufficient 20-run fixture seals a bounded seven-day quarantine packet backed by six timeout logs and one assertion log."
+    },
+    {
+      "name": "missing_evidence_boundary",
+      "valid": true,
+      "detail": "The empty-history case does not seal an invented decision; it requests an agent judgment, while the independent fixture requires a missing-evidence stop with no quarantine packet."
+    },
+    {
+      "name": "receipt_signature",
+      "valid": true,
+      "detail": "runx verify reported valid=true for sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550."
+    },
+    {
+      "name": "repository_side_effects",
+      "valid": true,
+      "detail": "The runner emits data only and grants no authority to edit a repository, disable a test, open an issue, start a PR, or merge."
+    }
+  ],
+  "receipt": {
+    "ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550",
+    "verify_command": "runx verify sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550 --receipt-dir <isolated-dir> --json",
+    "valid": true,
+    "mode": "production",
+    "findings": []
+  },
+  "artifacts": {
+    "public_url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge",
+    "source_url": "https://github.com/bbbbzzzzcc-afk/runx/tree/feat/flaky-test-judge/skills/flaky-test-judge",
+    "pr_url": "https://github.com/runxhq/runx/pull/197",
+    "x_yaml": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/X.yaml",
+    "skill_md": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/SKILL.md"
+  }
+}

From f2f076891e82409250032b90a6c9b50441389fde Mon Sep 17 00:00:00 2001
From: bbbbzzzzcc-afk <bbbbzzzcc@gmail.com>
Date: Wed, 1 Jul 2026 14:41:05 +0800
Subject: [PATCH 4/4] fix: align evidence with delivery contract

---
 skills/flaky-test-judge/REPORT.md         |  2 +-
 skills/flaky-test-judge/evidence.json     | 28 +++++++++++++++++++----
 skills/flaky-test-judge/verification.json |  6 ++---
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/skills/flaky-test-judge/REPORT.md b/skills/flaky-test-judge/REPORT.md
index 886424e89..73221ce39 100644
--- a/skills/flaky-test-judge/REPORT.md
+++ b/skills/flaky-test-judge/REPORT.md
@@ -6,6 +6,6 @@
 - The reviewed disposition is `quarantine` with confidence `0.96`, a seven-day ceiling, an explicit pytest exclusion marker, and downstream handoff target `issue-to-pr`.
 - The empty-history boundary refuses to invent evidence. The harness leaves it at `needs_agent`, and the independent fixture requires a `missing-evidence` stop, no quarantine object, and no dispatch target.
 - The skill is evidence-only. It does not edit repositories, change CI, disable a test, open an issue, create a pull request, or merge code.
-- The dogfood run emitted receipt `sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550`.
+- The dogfood run emitted receipt `runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550`.
 - Independent verification of that receipt returned `valid: true` in production mode with no findings.
 - Raw machine-readable evidence is in [`evidence.json`](./evidence.json), and the verification matrix is in [`verification.json`](./verification.json).
diff --git a/skills/flaky-test-judge/evidence.json b/skills/flaky-test-judge/evidence.json
index 919b36659..fe12a96f3 100644
--- a/skills/flaky-test-judge/evidence.json
+++ b/skills/flaky-test-judge/evidence.json
@@ -19,7 +19,7 @@
     },
     {
       "name": "public_url",
-      "value": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge"
+      "value": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e"
     },
     {
       "name": "pull_request",
@@ -76,7 +76,7 @@
         "package": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e",
         "input": "20 supplied runs: 13 passed, 6 explicit timeouts, and 1 assertion failure",
         "command": "runx skill bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e --inputs-json <fixture> --answers-json <reviewed-answer> --receipt-dir <isolated-dir> --json",
-        "receipt_ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550",
+        "receipt_ref": "runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550",
         "verify_verdict": "valid",
         "harness_cases": [
           "quarantine_justified",
@@ -92,7 +92,7 @@
   "evidence_items": [
     {
       "kind": "registry",
-      "url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge"
+      "url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e"
     },
     {
       "kind": "source",
@@ -112,7 +112,25 @@
     },
     {
       "kind": "receipt",
-      "ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550"
+      "ref": "runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550"
     }
-  ]
+  ],
+  "dogfood": {
+    "package": "bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e",
+    "input": {
+      "run_count": 20,
+      "passing_runs": 13,
+      "failure_modes": {
+        "timeout": 6,
+        "assertion": 1
+      }
+    },
+    "command": "runx skill bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e --inputs-json <fixture> --answers-json <reviewed-answer> --receipt-dir <isolated-dir> --json",
+    "receipt_ref": "runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550",
+    "verify_verdict": "valid",
+    "harness_cases": [
+      "quarantine_justified",
+      "missing_run_history"
+    ]
+  }
 }
diff --git a/skills/flaky-test-judge/verification.json b/skills/flaky-test-judge/verification.json
index b191e8510..2c9b4fc23 100644
--- a/skills/flaky-test-judge/verification.json
+++ b/skills/flaky-test-judge/verification.json
@@ -31,7 +31,7 @@
     {
       "name": "receipt_signature",
       "valid": true,
-      "detail": "runx verify reported valid=true for sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550."
+      "detail": "runx verify reported valid=true for runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550."
     },
     {
       "name": "repository_side_effects",
@@ -40,14 +40,14 @@
     }
   ],
   "receipt": {
-    "ref": "sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550",
+    "ref": "runx:receipt:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550",
     "verify_command": "runx verify sha256:c4bb972e74c43278b503eba0cc076b00264582addfce9a72eeadac8674f92550 --receipt-dir <isolated-dir> --json",
     "valid": true,
     "mode": "production",
     "findings": []
   },
   "artifacts": {
-    "public_url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge",
+    "public_url": "https://runx.ai/x/bbbbzzzzcc-afk/flaky-test-judge@sha-cd2fdb45ec9e",
     "source_url": "https://github.com/bbbbzzzzcc-afk/runx/tree/feat/flaky-test-judge/skills/flaky-test-judge",
     "pr_url": "https://github.com/runxhq/runx/pull/197",
     "x_yaml": "https://raw.githubusercontent.com/bbbbzzzzcc-afk/runx/feat/flaky-test-judge/skills/flaky-test-judge/X.yaml",