diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 9866a31..40ba377 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -38,6 +38,12 @@ "skills": "./", "description": "Create and manage worktrees safely and consistently across projects while avoiding stale branch bases." }, + { + "name": "semantic-slicing", + "source": "./skills/semantic-slicing", + "skills": "./", + "description": "Turn a large repo into reviewable semantic slices with evidence. Use code shape, threat candidates, issue clusters, and support chatter together so review budget lands on the right parts of the system." + }, { "name": "technical-deslop", "source": "./skills/technical-deslop", diff --git a/README.md b/README.md index e375ff9..9f04670 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ This is my personal **.skills** repository for Codex, Cursor, OpenClaw and agent | `ghcrawl-cluster-operator` | Operate ghcrawl local sync, clustering, and cluster inspection from the CLI. | `npx skills add vincentkoc/dotskills --skill ghcrawl-cluster-operator -y` | | `openclaw-github-dedupe` | Triage GitHub issue/PR clusters across repositories, preserving contributor credit while selecting canonical fixes and safe closures. | `npx skills add vincentkoc/dotskills --skill openclaw-github-dedupe -y` | | `operations-worktree` | Create safe git worktrees from fresh remote defaults instead of stale local branches. | `npx skills add vincentkoc/dotskills --skill operations-worktree -y` | +| `semantic-slicing` | Build local feature, threat, issue, and support maps for focused repo review. | `npx skills add vincentkoc/dotskills --skill semantic-slicing -y` | | `technical-deslop` | Ship clean diffs fast: remove AI noise and keep behavior unchanged. | `npx skills add vincentkoc/dotskills --skill technical-deslop -y` | | `technical-documentation` | Produce dev‑ready docs: clear, structured build/review for brownfield + evergreen. 
| `npx skills add vincentkoc/dotskills --skill technical-documentation -y` | | `technical-integrations` | Design integrations that land: vendor‑agnostic API/RFC/SDK plans with rollout safety. | `npx skills add vincentkoc/dotskills --skill technical-integrations -y` | @@ -57,6 +58,7 @@ npx skills add vincentkoc/dotskills --skill openclaw-github-dedupe -y npx skills add vincentkoc/dotskills --skill operations-worktree -y npx skills add vincentkoc/dotskills --skill crawlkit -y npx skills add vincentkoc/dotskills --skill graincrawl -y +npx skills add vincentkoc/dotskills --skill semantic-slicing -y ``` List available public skills: diff --git a/catalog.yaml b/catalog.yaml index bfb5c02..fa0721b 100644 --- a/catalog.yaml +++ b/catalog.yaml @@ -123,3 +123,10 @@ skills: source: local tags: [git, worktree, operations, workflow] version: 0.1.0 + + - id: semantic-slicing + name: Semantic Slicing + path: skills/semantic-slicing + source: local + tags: [semantic-slicing, security, review, visualization, gitcrawl, discrawl] + version: 0.1.0 diff --git a/releases/skills.json b/releases/skills.json index c277f41..b82261b 100644 --- a/releases/skills.json +++ b/releases/skills.json @@ -31,6 +31,12 @@ "description": "Create and manage worktrees safely and consistently across projects while avoiding stale branch bases.", "install": "npx skills add https://github.com/vincentkoc/dotskills --skill operations-worktree -y" }, + { + "name": "semantic-slicing", + "source": "./skills/semantic-slicing", + "description": "Turn a large repo into reviewable semantic slices with evidence. 
Use code shape, threat candidates, issue clusters, and support chatter together so review budget lands on the right parts of the system.", + "install": "npx skills add https://github.com/vincentkoc/dotskills --skill semantic-slicing -y" + }, { "name": "technical-deslop", "source": "./skills/technical-deslop", diff --git a/skills/semantic-slicing/SKILL.md b/skills/semantic-slicing/SKILL.md new file mode 100644 index 0000000..6b2ac4d --- /dev/null +++ b/skills/semantic-slicing/SKILL.md @@ -0,0 +1,86 @@ +--- +name: semantic-slicing +description: Build local semantic review slices by combining clawpatch feature maps, deepsec threat candidates, visual review maps, and optional gitcrawl/discrawl evidence for repos such as openclaw/openclaw. +license: AGPL-3.0-only +metadata: + source: "https://github.com/vincentkoc/dotskills" +--- + +# Semantic Slicing + +## Purpose + +Turn a large repo into reviewable semantic slices with evidence. Use code shape, threat candidates, issue clusters, and support chatter together so review budget lands on the right parts of the system. + +Default stance: map locally first, rank second, spend agent/security-review budget last. + +## When to use + +- Setting up or running `openclaw/clawpatch` against a target repo. +- Setting up or running `vercel-labs/deepsec` against a target repo. +- Producing a local visual map of feature slices, risky files, ownership clusters, or review targets. +- Cross-checking code slices against `gitcrawl` issue/PR data or `discrawl` Discord/support data. +- Planning a focused security, regression, architecture, or maintainer-review pass for a large repo. + +## Workflow + +1. Create a scratch run directory outside the target checkout, usually `~/.semantic-slicing/<repo>/<timestamp>/`. +2. Read target repo instructions before scanning. For OpenClaw, read root `AGENTS.md`; subtree guides matter when reviewing a slice. +3. 
Verify tool setup: + - `clawpatch`: clone/build `openclaw/clawpatch`, then run `clawpatch init`, `clawpatch map`, `clawpatch status`. + - `deepsec`: clone/build `vercel-labs/deepsec`, scaffold a scratch workspace, then run `deepsec scan`. + - `gitcrawl`: run `gitcrawl doctor --json`, then pull clusters/threads for related issue evidence. + - `discrawl`: run `discrawl doctor --json` and `discrawl status --json`; use search/digest only when support chatter is relevant. +4. Run deterministic maps before AI review: + - Clawpatch feature map for entrypoints/packages/config/test slices. + - Deepsec regex scan for candidate threat surfaces. + - Optional gitcrawl/discrawl lookups for historical pain around the same files, components, or symptoms. +5. Run `scripts/semantic-map.mjs` to merge the local artifacts into `semantic-map.html` and `semantic-map.json`. +6. Rank slices by combined signal: + - high-risk entrypoint or tool boundary, + - deepsec candidate density and slug quality, + - recent/open gitcrawl clusters, + - matching discrawl support terms, + - churn/ownership/test gaps if available. +7. Choose a cost size before running AI stages: + - `low`: deterministic maps only; no `deepsec process` or real `clawpatch review`. + - `medium`: one to three explicit files/features with high-risk slugs, batch size 1, concurrency 1, and a turn cap. + - `high`: broader AI processing or multiple feature reviews; requires an explicit budget/time decision. +8. Run AI only at the chosen size: + - `clawpatch review --feature <feature-id>` or a small `--limit`. + - `deepsec process --files <paths>` or tightly scoped `--filter` plus `--only-slugs`. +9. Report exact artifact paths, run IDs, counts, cost size, exclusions, and skipped expensive stages. + +## Inputs + +- `target_repo`: local checkout path and/or GitHub `owner/repo`. +- `scratch_root`: local artifact directory, default `~/.semantic-slicing/<repo>/<timestamp>/`. 
+- `clawpatch_repo`: local clone of `openclaw/clawpatch`, optional if `clawpatch` is already on PATH. +- `deepsec_repo`: local clone of `vercel-labs/deepsec`, optional if `deepsec` is already on PATH. +- `focus`: optional path prefixes, issue numbers, slugs, components, or channels to prioritize. +- `cost_size`: `low`, `medium`, or `high`; default `low`. +- `budget_mode`: `map-only`, `targeted-ai`, or `full-ai`; default follows `cost_size`. + +## Outputs + +- Tool setup status and blocker list. +- Clawpatch feature counts and contamination checks. +- Deepsec scan run ID, candidate counts, top slugs, and top files. +- Optional gitcrawl cluster/thread evidence and discrawl support evidence. +- Local visual map: `semantic-map.html` plus machine-readable `semantic-map.json`. +- Ranked slice plan with recommended next commands and cost-size rationale. + +## Guardrails + +- Keep generated artifacts out of the target repo unless the user explicitly wants checked-in config. +- Do not run full `deepsec process` or broad `clawpatch review` without an explicit high-cost decision; these can be expensive and noisy. +- Treat local nested worktrees and dot-agent folders as contamination unless intentionally in scope: `.claude/`, `.codex/`, `.agents/`, `.deepsec/`, `.semantic-slicing/`. +- If a tool maps contaminated paths, post-filter before ranking and call out the upstream limitation. +- Never paste secrets from scan outputs. Scrub absolute personal paths before external PRs/comments. +- For OpenClaw, use Testbox/Crabbox only when the task moves from mapping into validation. + +## References + +- Read `references/workflow.md` for concrete local setup and run commands. +- Read `references/slicing-taxonomy.md` when choosing slice types or map layers. +- Read `references/openclaw-profile.md` when the target is `openclaw/openclaw`. 
diff --git a/skills/semantic-slicing/agents/openai.yaml b/skills/semantic-slicing/agents/openai.yaml new file mode 100644 index 0000000..bcf1cfa --- /dev/null +++ b/skills/semantic-slicing/agents/openai.yaml @@ -0,0 +1,10 @@ +interface: + display_name: "Semantic Slicing" + short_description: "Build local feature, threat, issue, and support maps for focused repo review." + icon_small: "./assets/icon.jpg" + icon_large: "./assets/icon.jpg" + brand_color: "#111827" + default_prompt: "Create a semantic slice map for this repo using local clawpatch/deepsec artifacts and gitcrawl/discrawl evidence where relevant." + +policy: + allow_implicit_invocation: true diff --git a/skills/semantic-slicing/assets/icon.jpg b/skills/semantic-slicing/assets/icon.jpg new file mode 100644 index 0000000..8c8b22a Binary files /dev/null and b/skills/semantic-slicing/assets/icon.jpg differ diff --git a/skills/semantic-slicing/references/openclaw-profile.md b/skills/semantic-slicing/references/openclaw-profile.md new file mode 100644 index 0000000..7c52caf --- /dev/null +++ b/skills/semantic-slicing/references/openclaw-profile.md @@ -0,0 +1,59 @@ +# OpenClaw Profile + +Target repo: `openclaw/openclaw`. + +## High-signal buckets + +- `src/agents/**`: agent tools, shell/process/file access, sandbox policy, session reuse. +- `src/gateway/**`: protocol, auth, WebSocket, delivery, live runtime. +- `src/plugins/**`: plugin discovery, registry, activation, manifest/public-surface loaders. +- `extensions/*`: bundled plugin runtime boundaries, channel/provider behavior. +- `packages/memory-host-sdk/**`: storage, embeddings, remote HTTP, SSRF and proxy controls. +- `scripts/**`: release, CI, Docker, package, generated contract checks. +- `ui/**` and apps: local app/browser boundary, WebView and bridge surfaces. 
+ +## Default excludes + +Treat these as contamination unless explicitly requested: + +```text +.git/ +.claude/ +.codex/ +.agents/ +.deepsec/ +.semantic-slicing/ +node_modules/ +dist/ +build/ +coverage/ +.next/ +.turbo/ +``` + +## Local probe on 2026-05-16 + +Observed setup results on `openclaw/openclaw`: + +- `clawpatch` built locally and mapped 1,099 feature records. +- The map included hundreds of `.claude`/`.codex`/`.agents` path references even with a config exclude. Post-filtering is required before review queue ranking. +- `deepsec` built locally and scanned OpenClaw in 53.7 seconds. +- Deepsec scan run ID: `20260516011830-96433ac3b3b6762a`. +- Deepsec found 4,055 pending candidate files and 9,628 total matcher hits. +- Highest-volume slugs were `insecure-crypto`, `agent-tool-definition`, `process-env-access`, `secret-in-log`, and `spread-operator-injection`. +- `gitcrawl doctor --json` showed local OpenClaw data but the last sync was older than the current date, so use it as shortlist context and verify live state with `gh` before mutating. +- `discrawl doctor --json` was healthy in git-share mode; `discrawl status --json` showed share update needed. + +Hydrated follow-up on the same day: + +- Hydrated `deepsec` with OpenClaw-specific `INFO.md` and priority/ignore config. +- Fresh scan run ID: `20260516014350-082402b74eb441df`. +- Fresh scan found 4,050 candidate files and 9,579 total matcher hits. +- One targeted AI process pass on `src/agents/pi-embedded-runner/run/attempt.ts` produced 0 findings, cost `$4.794546`, and used 236,258 input tokens plus 6,065,152 cache-read tokens. +- That file mapped to clawpatch feature `feat_library_997fa9c066`; dry-run review returned `wouldReview: 1`. + +Operational implication: default to `low` cost sizing for maps and queue building. Use `medium` only for file-explicit high-risk slices. Treat broad `deepsec process` as `high` cost unless the user has set a clear budget. 
+ +## OpenClaw verification rule + +Mapping is not validation. If a slice leads to a code change, follow OpenClaw repo rules for targeted tests and Testbox/Crabbox proof before handoff. diff --git a/skills/semantic-slicing/references/slicing-taxonomy.md b/skills/semantic-slicing/references/slicing-taxonomy.md new file mode 100644 index 0000000..63d2db9 --- /dev/null +++ b/skills/semantic-slicing/references/slicing-taxonomy.md @@ -0,0 +1,109 @@ +# Slicing Taxonomy + +Use multiple slice layers. One lens is too easy to game. + +## Feature slices + +Source: clawpatch feature records. + +Best for: +- package/API/plugin boundaries, +- route or command entrypoints, +- test ownership, +- review units for targeted agent passes. + +Watch for: +- hidden worktrees, +- generated output, +- overly broad source-group slices, +- missing framework-specific entrypoints. + +## Threat slices + +Source: deepsec file records and candidates. + +Best for: +- path traversal, SSRF, RCE, auth bypass, secret/log surfaces, +- high-density files, +- framework mismatch gaps where default matchers are weak. + +Watch for: +- noisy slug families like generic crypto usage, +- candidate count without exploitability, +- processing cost. + +## Issue slices + +Source: gitcrawl clusters, threads, summaries, live GitHub checks. + +Best for: +- repeated user-visible failures, +- closed-but-recurring regressions, +- duplicate PR/issue clusters, +- maintainer narrative and shipped history. + +Watch for: +- stale local crawl state, +- title similarity without root-cause match, +- closed clusters that should only inform risk, not current truth. + +## Support slices + +Source: discrawl search, digest, analytics. + +Best for: +- Discord/support symptom clusters, +- community vocabulary that differs from GitHub titles, +- production pain not yet filed as issues. + +Watch for: +- private chatter leakage, +- stale share sync, +- unstructured complaints without repro. 
+ +## Diff slices + +Source: `git diff`, PR file lists, changed-lines metadata. + +Best for: +- PR review, +- regression-focused scans, +- verifying whether a fix touches the real symptom path. + +Watch for: +- tests/docs-only changes that should not expand into a full security scan, +- moved files that break naive path matching. + +## Runtime/import slices + +Source: import graph, startup profiles, package manifests, plugin manifests. + +Best for: +- hot-path performance, +- lazy-loading regressions, +- plugin/core boundary leakage, +- package/dependency ownership. + +Watch for: +- circular imports, +- static+dynamic imports of the same heavy module, +- core/plugin boundary violations. + +## Visual review map + +Map each bucket with: +- feature count, +- deepsec candidate count, +- top slugs, +- top files, +- gitcrawl cluster count, +- discrawl hit count, +- contamination count. + +Recommended ranking: + +```text +score = entrypoint_weight + threat_density + issue_signal + support_signal + churn_signal - contamination_penalty +``` + +The score is a review queue, not a bug claim. 
diff --git a/skills/semantic-slicing/references/workflow.md b/skills/semantic-slicing/references/workflow.md new file mode 100644 index 0000000..1eafd12 --- /dev/null +++ b/skills/semantic-slicing/references/workflow.md @@ -0,0 +1,153 @@ +# Local Semantic Workflow + +## Scratch layout + +Use a run directory outside the target checkout: + +```bash +RUN_ROOT="$HOME/.semantic-slicing/openclaw/$(date +%Y%m%d-%H%M%S)" +mkdir -p "$RUN_ROOT" +``` + +Recommended layout: + +```text +<RUN_ROOT>/ + clawpatch/ + deepsec/ + semantic-map.html + semantic-map.json +``` + +## Clawpatch + +Setup from source: + +```bash +git clone https://github.com/openclaw/clawpatch.git ~/GIT/_Perso/clawpatch +cd ~/GIT/_Perso/clawpatch +pnpm install +pnpm build +``` + +Run against a target repo: + +```bash +node ~/GIT/_Perso/clawpatch/dist/cli.js \ + --root ~/GIT/_Perso/openclaw \ + --state-dir "$RUN_ROOT/clawpatch" \ + init --json + +node ~/GIT/_Perso/clawpatch/dist/cli.js \ + --root ~/GIT/_Perso/openclaw \ + --state-dir "$RUN_ROOT/clawpatch" \ + map --json + +node ~/GIT/_Perso/clawpatch/dist/cli.js \ + --root ~/GIT/_Perso/openclaw \ + --state-dir "$RUN_ROOT/clawpatch" \ + status --json +``` + +After mapping, check contamination: + +```bash +find "$RUN_ROOT/clawpatch/features" -type f -print0 | + xargs -0 jq -r '.ownedFiles[].path, .entrypoints[].path, .contextFiles[].path' | + rg '^\.(claude|codex|agents|deepsec|semantic-slicing)/' | wc -l +``` + +If contamination is non-zero, post-filter before ranking. Current clawpatch may still seed hidden local worktree paths even when config excludes are present. 
+ +## Deepsec + +Setup from source: + +```bash +git clone https://github.com/vercel-labs/deepsec.git ~/GIT/_Perso/deepsec +cd ~/GIT/_Perso/deepsec +pnpm install +pnpm -r build +pnpm bundle +``` + +Create a scratch workspace and link the local build: + +```bash +node ~/GIT/_Perso/deepsec/packages/deepsec/dist/cli.mjs \ + init "$RUN_ROOT/deepsec" ~/GIT/_Perso/openclaw --id openclaw --force + +cd "$RUN_ROOT/deepsec" +pnpm add -w "deepsec@file:$HOME/GIT/_Perso/deepsec/packages/deepsec" +``` + +Run deterministic scan: + +```bash +node ~/GIT/_Perso/deepsec/packages/deepsec/dist/cli.mjs scan --project-id openclaw +node ~/GIT/_Perso/deepsec/packages/deepsec/dist/cli.mjs status --project-id openclaw +node ~/GIT/_Perso/deepsec/packages/deepsec/dist/cli.mjs metrics --project-id openclaw +``` + +Do not run `process` blindly on large candidate sets. Size the run first: + +| Size | Use when | Shape | +| --- | --- | --- | +| `low` | You need a review map or queue only. | `clawpatch map`, `deepsec scan`, `semantic-map.mjs`; no AI processing. | +| `medium` | A slice has strong threat + feature overlap. | 1-3 files/features, high-risk slugs only, `--batch-size 1`, `--concurrency 1`, capped turns. | +| `high` | The user explicitly wants broad AI review and accepts cost/time. | Multiple features or wider filters; record budget, run IDs, and stopping condition first. 
| + +Prefer file-explicit processing after ranking: + +```bash +pnpm deepsec process --project-id openclaw \ + --files src/agents/pi-tools.read.ts,src/agents/sandbox/ssh-backend.ts \ + --only-slugs path-traversal,rce,ssrf,auth-bypass,missing-auth,secret-in-log \ + --batch-size 1 \ + --concurrency 1 \ + --max-turns 40 +``` + +If you only need matcher-level narrowing: + +```bash +pnpm deepsec scan --project-id openclaw --matchers path-traversal,rce,ssrf +``` + +## Gitcrawl + +Use gitcrawl as issue/PR memory, not live truth: + +```bash +gitcrawl doctor --json +gitcrawl clusters openclaw/openclaw --min-size 2 --limit 20 --sort size --json +gitcrawl threads openclaw/openclaw --numbers 123,456 --include-closed --json +``` + +For freshness, re-check decisive open/closed/merged state with `gh` before mutation. + +## Discrawl + +Use discrawl for support/channel evidence when user reports mention Discord, support chatter, or community symptoms: + +```bash +discrawl doctor --json +discrawl status --json +discrawl search "gateway auth" --limit 25 --json +discrawl digest --help +``` + +Avoid pulling personal or unrelated message content into reports. Summarize only the symptom evidence needed to rank the slice. + +## Visual map + +Generate the local review map: + +```bash +node /path/to/semantic-slicing/scripts/semantic-map.mjs \ + --clawpatch "$RUN_ROOT/clawpatch" \ + --deepsec "$RUN_ROOT/deepsec/data/openclaw" \ + --out "$RUN_ROOT/semantic-map.html" +``` + +The script writes both HTML and JSON. Use the HTML for review and the JSON for follow-up automation. 
diff --git a/skills/semantic-slicing/scripts/semantic-map.mjs b/skills/semantic-slicing/scripts/semantic-map.mjs new file mode 100755 index 0000000..11f273c --- /dev/null +++ b/skills/semantic-slicing/scripts/semantic-map.mjs @@ -0,0 +1,310 @@ +#!/usr/bin/env node +import fs from "node:fs"; +import path from "node:path"; + +const args = parseArgs(process.argv.slice(2)); +if (!args.clawpatch && !args.deepsec) { + die("usage: semantic-map.mjs --clawpatch --deepsec --out "); +} + +const outPath = args.out ?? path.resolve(process.cwd(), "semantic-map.html"); +const contaminationPrefixes = [ + ".git/", + ".claude/", + ".codex/", + ".agents/", + ".deepsec/", + ".semantic-slicing/", + "node_modules/", + "dist/", + "build/", + "coverage/", + ".next/", + ".turbo/", +]; + +const features = args.clawpatch ? readClawpatchFeatures(args.clawpatch) : []; +const deepsecFiles = args.deepsec ? readDeepsecFiles(args.deepsec) : []; +const buckets = new Map(); +const slugCounts = new Map(); +const topFiles = []; + +for (const feature of features) { + const refs = [ + ...(feature.ownedFiles ?? []), + ...(feature.entrypoints ?? []), + ...(feature.contextFiles ?? []), + ].map((item) => item.path).filter(Boolean); + const contaminated = refs.filter(isContaminated); + const cleanRefs = refs.filter((item) => !isContaminated(item)); + if (cleanRefs.length === 0 && contaminated.length > 0) continue; + const bucket = bucketFor(preferredFeaturePath(feature, cleanRefs)); + const record = ensureBucket(buckets, bucket); + record.features += 1; + record.kinds[feature.kind ?? "unknown"] = (record.kinds[feature.kind ?? "unknown"] ?? 0) + 1; + record.sources[feature.source ?? "unknown"] = (record.sources[feature.source ?? "unknown"] ?? 
0) + 1; + record.contaminatedRefs += contaminated.length; + if (feature.featureId && record.featureIds.length < 12) record.featureIds.push(feature.featureId); +} + +for (const file of deepsecFiles) { + if (isContaminated(file.filePath)) continue; + const candidates = Array.isArray(file.candidates) ? file.candidates : []; + if (candidates.length === 0) continue; + const bucket = bucketFor(file.filePath); + const record = ensureBucket(buckets, bucket); + record.deepsecFiles += 1; + record.deepsecCandidates += candidates.length; + for (const candidate of candidates) { + const slug = candidate.vulnSlug ?? candidate.slug ?? "unknown"; + record.slugs[slug] = (record.slugs[slug] ?? 0) + 1; + slugCounts.set(slug, (slugCounts.get(slug) ?? 0) + 1); + } + topFiles.push({ + path: file.filePath, + bucket, + candidates: candidates.length, + slugs: [...new Set(candidates.map((item) => item.vulnSlug ?? item.slug ?? "unknown"))].slice(0, 8), + }); +} + +const bucketRows = [...buckets.values()] + .map((bucket) => ({ + ...bucket, + score: bucket.features + bucket.deepsecCandidates + bucket.deepsecFiles * 3 - bucket.contaminatedRefs, + topSlugs: topEntries(bucket.slugs, 8), + topKinds: topEntries(bucket.kinds, 5), + topSources: topEntries(bucket.sources, 5), + })) + .sort((a, b) => b.score - a.score || b.deepsecCandidates - a.deepsecCandidates || a.name.localeCompare(b.name)); + +topFiles.sort((a, b) => b.candidates - a.candidates || a.path.localeCompare(b.path)); + +const summary = { + generatedAt: new Date().toISOString(), + inputs: { + clawpatch: args.clawpatch ?? null, + deepsec: args.deepsec ?? 
null, + }, + totals: { + buckets: bucketRows.length, + features: bucketRows.reduce((sum, item) => sum + item.features, 0), + deepsecFiles: bucketRows.reduce((sum, item) => sum + item.deepsecFiles, 0), + deepsecCandidates: bucketRows.reduce((sum, item) => sum + item.deepsecCandidates, 0), + contaminatedRefs: bucketRows.reduce((sum, item) => sum + item.contaminatedRefs, 0), + }, + topSlugs: topEntries(Object.fromEntries(slugCounts), 20), + buckets: bucketRows, + topFiles: topFiles.slice(0, 100), +}; + +fs.mkdirSync(path.dirname(outPath), { recursive: true }); +fs.writeFileSync(outPath, renderHtml(summary), "utf8"); +fs.writeFileSync(outPath.replace(/\.html?$/u, "") + ".json", JSON.stringify(summary, null, 2) + "\n", "utf8"); +console.log(`wrote ${outPath}`); +console.log(`wrote ${outPath.replace(/\.html?$/u, "")}.json`); + +function parseArgs(argv) { + const out = {}; + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index]; + if (!arg.startsWith("--")) die(`unexpected argument: ${arg}`); + const key = arg.slice(2); + const value = argv[index + 1]; + if (!value || value.startsWith("--")) die(`missing value for ${arg}`); + out[key] = value; + index += 1; + } + return out; +} + +function readClawpatchFeatures(stateDir) { + const featureDir = path.join(stateDir, "features"); + if (!fs.existsSync(featureDir)) return []; + return readJsonFiles(featureDir); +} + +function readDeepsecFiles(projectDataDir) { + const fileDir = path.join(projectDataDir, "files"); + if (!fs.existsSync(fileDir)) return []; + return readJsonFiles(fileDir); +} + +function readJsonFiles(dir) { + const files = []; + for (const entry of fs.readdirSync(dir, { withFileTypes: true })) { + const full = path.join(dir, entry.name); + if (entry.isDirectory()) { + files.push(...readJsonFiles(full)); + } else if (entry.isFile() && entry.name.endsWith(".json")) { + try { + files.push(JSON.parse(fs.readFileSync(full, "utf8"))); + } catch { + // Keep map generation best-effort; corrupt 
records are skipped. + } + } + } + return files; +} + +function bucketFor(filePath) { + const normalized = String(filePath).replaceAll("\\", "/").replace(/^\.?\//u, ""); + const parts = normalized.split("/"); + if (parts[0] === "extensions" && parts[1]) return `extensions/${parts[1]}`; + if (parts[0] === "packages" && parts[1]) return `packages/${parts[1]}`; + if (parts[0] === "apps" && parts[1]) return `apps/${parts[1]}`; + if (parts[0] === "src" && parts[1]) return `src/${parts[1]}`; + if (parts[0] === "ui") return parts[1] === "src" && parts[2] ? `ui/${parts[2]}` : "ui"; + if (parts[0] === "scripts") return "scripts"; + if (parts[0] === "docs") return "docs"; + return parts[0] || "unknown"; +} + +function preferredFeaturePath(feature, cleanRefs) { + const nonManifest = cleanRefs.find((item) => path.basename(item) !== "package.json"); + if (nonManifest) return nonManifest; + const titlePath = titlePathHint(feature?.title); + if (titlePath) return titlePath; + return cleanRefs[0] ?? feature?.title ?? 
"unknown"; +} + +function titlePathHint(title) { + if (!title) return null; + const match = String(title).match(/\b(?:Node source|Project config|Package script|CLI command)\s+([^#(]+)/u); + if (!match) return null; + return match[1].trim(); +} + +function ensureBucket(map, name) { + if (!map.has(name)) { + map.set(name, { + name, + features: 0, + deepsecFiles: 0, + deepsecCandidates: 0, + contaminatedRefs: 0, + kinds: {}, + sources: {}, + slugs: {}, + featureIds: [], + }); + } + return map.get(name); +} + +function isContaminated(filePath) { + const normalized = String(filePath).replaceAll("\\", "/").replace(/^\.?\//u, ""); + return contaminationPrefixes.some((prefix) => normalized === prefix.slice(0, -1) || normalized.startsWith(prefix)); +} + +function topEntries(object, limit) { + return Object.entries(object) + .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])) + .slice(0, limit) + .map(([name, count]) => ({ name, count })); +} + +function renderHtml(summary) { + const maxScore = Math.max(...summary.buckets.map((item) => item.score), 1); + const nodes = summary.buckets.slice(0, 32).map((bucket, index) => { + const col = index % 8; + const row = Math.floor(index / 8); + const radius = 18 + Math.round((bucket.score / maxScore) * 34); + return { + ...bucket, + x: 70 + col * 135, + y: 75 + row * 125, + radius, + }; + }); + const width = 1080; + const height = Math.max(260, 135 + Math.ceil(nodes.length / 8) * 125); + return ` + + + + +Semantic Slice Map + + + +
+

Semantic Slice Map

+
Generated ${escapeHtml(summary.generatedAt)}
+
+${card("Buckets", summary.totals.buckets)} +${card("Features", summary.totals.features)} +${card("Deepsec files", summary.totals.deepsecFiles)} +${card("Candidates", summary.totals.deepsecCandidates)} +
+

Review Map

+ +${nodes.map((node) => ` 100 ? "mid" : ""}" cx="${node.x}" cy="${node.y}" r="${node.radius}">${escapeHtml(node.name)}: score ${node.score}, features ${node.features}, candidates ${node.deepsecCandidates} +${escapeHtml(shortLabel(node.name, 18))} +${node.features}f / ${node.deepsecCandidates}c`).join("\n")} + +

Top Buckets

+ + + +${summary.buckets.slice(0, 40).map((bucket) => ``).join("\n")} + +
BucketScoreFeaturesFilesCandidatesTop slugs
${escapeHtml(bucket.name)}${bucket.score}${bucket.features}${bucket.deepsecFiles}${bucket.deepsecCandidates}${pills(bucket.topSlugs)}
+

Top Files

+ + + +${summary.topFiles.slice(0, 50).map((file) => ``).join("\n")} + +
FileBucketCandidatesSlugs
${escapeHtml(file.path)}${escapeHtml(file.bucket)}${file.candidates}${file.slugs.map((slug) => `${escapeHtml(slug)}`).join("")}
+
+ +`; +} + +function card(label, value) { + return `
${Number(value).toLocaleString()}${escapeHtml(label)}
`; +} + +function pills(entries) { + return entries.map((entry) => `${escapeHtml(entry.name)} ${entry.count}`).join(""); +} + +function shortLabel(value, limit) { + return value.length > limit ? `${value.slice(0, limit - 1)}…` : value; +} + +function escapeHtml(value) { + return String(value) + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll('"', """); +} + +function die(message) { + console.error(message); + process.exit(2); +}