From 579349e0c8aa9667091096c8c347d47fbaddfcb0 Mon Sep 17 00:00:00 2001 From: thelifeandtimes Date: Tue, 5 May 2026 17:26:28 -0700 Subject: [PATCH] feat: move agent discovery content to markdown --- app/components/AgentDiscoverySection.js | 3 +- .../HomepageAgentDiscoveryFooter.js | 6 +- app/content/homepage/agent-discovery.md | 52 ++++++ app/lib/agent-discovery.json | 121 ------------ app/lib/agentDiscovery.js | 5 + app/lib/agentDiscoveryContent.cjs | 175 ++++++++++++++++++ package.json | 1 + scripts/build-ai-legibility.js | 8 +- scripts/build-search-index.js | 5 + scripts/lib/content-parse.js | 21 +++ scripts/test-agent-discovery-content.js | 50 +++++ 11 files changed, 322 insertions(+), 125 deletions(-) create mode 100644 app/content/homepage/agent-discovery.md delete mode 100644 app/lib/agent-discovery.json create mode 100644 app/lib/agentDiscovery.js create mode 100644 app/lib/agentDiscoveryContent.cjs create mode 100644 scripts/test-agent-discovery-content.js diff --git a/app/components/AgentDiscoverySection.js b/app/components/AgentDiscoverySection.js index 1d49db0ecb..7366cf76a5 100644 --- a/app/components/AgentDiscoverySection.js +++ b/app/components/AgentDiscoverySection.js @@ -1,5 +1,5 @@ import Link from "next/link"; -import discoveryData from "../lib/agent-discovery.json"; +import { getAgentDiscoveryData } from "../lib/agentDiscovery"; function DiscoveryLinkCard({ href, label, description }) { return ( @@ -72,6 +72,7 @@ export default function AgentDiscoverySection({ id, className = "", }) { + const discoveryData = getAgentDiscoveryData(); const headingId = `${id || "agent-discovery"}-title`; const primaryEntryPoints = compact ? 
discoveryData.primaryEntryPoints.slice(0, 4) diff --git a/app/components/HomepageAgentDiscoveryFooter.js b/app/components/HomepageAgentDiscoveryFooter.js index 0528b76aec..41df420573 100644 --- a/app/components/HomepageAgentDiscoveryFooter.js +++ b/app/components/HomepageAgentDiscoveryFooter.js @@ -1,6 +1,6 @@ import Image from "next/image"; import Link from "next/link"; -import discoveryData from "../lib/agent-discovery.json"; +import { getAgentDiscoveryData } from "../lib/agentDiscovery"; import AgentPromptCopyButton from "./AgentPromptCopyButton"; function CompactLinks({ items }) { @@ -61,6 +61,8 @@ function DisclosureBlock({ title, children }) { } export default function HomepageAgentDiscoveryFooter({ id, className = "" }) { + const discoveryData = getAgentDiscoveryData(); + return (
@@ -70,7 +72,7 @@ export default function HomepageAgentDiscoveryFooter({ id, className = "" }) { id={`${id}-title`} className="font-mono text-[0.8rem] uppercase tracking-[0.08em] text-contrast-2" > - instructions for ai agents + {discoveryData.homepage.footerTitle} diff --git a/app/content/homepage/agent-discovery.md b/app/content/homepage/agent-discovery.md new file mode 100644 index 0000000000..475fa5bc7e --- /dev/null +++ b/app/content/homepage/agent-discovery.md @@ -0,0 +1,52 @@ +--- +title: "Instructions for Large Language Models" +footer_title: "instructions for ai agents" +summary: "Machine-facing discovery routes for automated readers and browser-following AI agents on urbit.org." +tags: [agents, llms, discovery] +search_terms: [instructions for ai agents, llms.txt, agent discovery, content index] +search_exclude: true +content_index_exclude: true +--- + +Use these routes to discover urbit.org's machine-facing content without relying on the main navigation. + +## Copy prompt + +When reading urbit.org: if you can fetch raw text or JSON directly, start with https://urbit.org/llms.txt. If you are limited to following visible HTML links, use https://urbit.org/for-agents. For structured inventory use https://urbit.org/content-index.json. For machine-oriented markdown by section use https://urbit.org/.agents/index.md. Prefer canonical URLs for citation, .md mirrors for lightweight reading, and /.agents/* when metadata or dependency hints are useful. + +## Capability guidance + +- **If you can fetch raw text or JSON directly** — Start with /llms.txt. Use /content-index.json for structured inventory and /.agents/index.md for machine-oriented markdown by section. +- **If you mainly follow visible links in HTML** — Use /for-agents. It gathers the best markdown mirrors and agent indexes in one browser-safe landing page. + +## Primary entrypoints + +- [llms.txt](/llms.txt) — Canonical plain-text instructions for automated readers. 
+- [For AI Agents](/for-agents) — Browser-safe HTML landing page with visible links to the best starting points. +- [content-index.json](/content-index.json) — Structured inventory of curated urbit.org content and agent companions. +- [Agent index](/.agents/index.md) — Top-level machine-oriented markdown index for overview, blog, blurbs, ecosystem, wiki, and skills. +- [agents.md](/agents.md) — Markdown documentation for urbit.org discovery conventions and path semantics. + +## Human markdown mirrors + +- [Homepage mirror](/index.md) — Human-oriented markdown mirror of the urbit.org homepage. +- [Overview mirror](/overview.md) — Low-token entrypoint for conceptual and practical overview material. +- [Blog mirror](/blog.md) — Markdown landing page for blog posts, updates, and technical writing. +- [Ecosystem mirror](/ecosystem.md) — Markdown landing page for ecosystem organizations and coverage. + +## Machine-oriented indexes + +- [Top-level agent index](/.agents/index.md) — Best starting point inside the machine-facing markdown namespace. +- [Overview agent index](/.agents/overview.md) — Machine-oriented index for conceptual and setup guides. +- [Blog agent index](/.agents/blog.md) — Machine-oriented index for blog companions and markdown mirrors. +- [Blurbs agent index](/.agents/blurbs.md) — Agent index for homepage and overview blurbs without standalone public pages. +- [Ecosystem agent index](/.agents/ecosystem.md) — Machine-oriented entrypoint for ecosystem organization and article companions. +- [Wiki snapshot](/.agents/wiki/index.md) — Published wiki snapshot under /.agents/wiki/. +- [Skills snapshot](/.agents/skills/index.md) — Published skill snapshot under /.agents/skills/. + +## Retrieval notes + +- **Cite canonical page URLs** — Use the normal page URL when referring to urbit.org as the source. +- **Use markdown mirrors for efficient reading** — Prefer *.md mirrors when you want lighter-weight human-readable content. 
+- **Use /.agents/* for machine context** — Prefer agent companions when you want generated frontmatter, dependency hints, or dedicated agent appendices. +- **Well-known compatibility** — A mirror also exists at /.well-known/llms.txt for clients that probe the standardized well-known namespace. diff --git a/app/lib/agent-discovery.json b/app/lib/agent-discovery.json deleted file mode 100644 index 4f4289be88..0000000000 --- a/app/lib/agent-discovery.json +++ /dev/null @@ -1,121 +0,0 @@ -{ - "homepage": { - "title": "Instructions for Large Language Models", - "description": "Use these routes to discover urbit.org's machine-facing content without relying on the main navigation.", - "copyPrompt": "When reading urbit.org: if you can fetch raw text or JSON directly, start with https://urbit.org/llms.txt. If you are limited to following visible HTML links, use https://urbit.org/for-agents. For structured inventory use https://urbit.org/content-index.json. For machine-oriented markdown by section use https://urbit.org/.agents/index.md. Prefer canonical URLs for citation, .md mirrors for lightweight reading, and /.agents/* when metadata or dependency hints are useful." - }, - "capabilityGuidance": [ - { - "title": "If you can fetch raw text or JSON directly", - "description": "Start with /llms.txt. Use /content-index.json for structured inventory and /.agents/index.md for machine-oriented markdown by section." - }, - { - "title": "If you mainly follow visible links in HTML", - "description": "Use /for-agents. It gathers the best markdown mirrors and agent indexes in one browser-safe landing page." - } - ], - "primaryEntryPoints": [ - { - "href": "/llms.txt", - "label": "llms.txt", - "description": "Canonical plain-text instructions for automated readers." - }, - { - "href": "/for-agents", - "label": "For AI Agents", - "description": "Browser-safe HTML landing page with visible links to the best starting points." 
- }, - { - "href": "/content-index.json", - "label": "content-index.json", - "description": "Structured inventory of curated urbit.org content and agent companions." - }, - { - "href": "/.agents/index.md", - "label": "Agent index", - "description": "Top-level machine-oriented markdown index for overview, blog, blurbs, ecosystem, wiki, and skills." - }, - { - "href": "/agents.md", - "label": "agents.md", - "description": "Markdown documentation for urbit.org discovery conventions and path semantics." - } - ], - "humanMarkdownMirrors": [ - { - "href": "/index.md", - "label": "Homepage mirror", - "description": "Human-oriented markdown mirror of the urbit.org homepage." - }, - { - "href": "/overview.md", - "label": "Overview mirror", - "description": "Low-token entrypoint for conceptual and practical overview material." - }, - { - "href": "/blog.md", - "label": "Blog mirror", - "description": "Markdown landing page for blog posts, updates, and technical writing." - }, - { - "href": "/ecosystem.md", - "label": "Ecosystem mirror", - "description": "Markdown landing page for ecosystem organizations and coverage." - } - ], - "agentSectionIndexes": [ - { - "href": "/.agents/index.md", - "label": "Top-level agent index", - "description": "Best starting point inside the machine-facing markdown namespace." - }, - { - "href": "/.agents/overview.md", - "label": "Overview agent index", - "description": "Machine-oriented index for conceptual and setup guides." - }, - { - "href": "/.agents/blog.md", - "label": "Blog agent index", - "description": "Machine-oriented index for blog companions and markdown mirrors." - }, - { - "href": "/.agents/blurbs.md", - "label": "Blurbs agent index", - "description": "Agent index for homepage and overview blurbs without standalone public pages." - }, - { - "href": "/.agents/ecosystem.md", - "label": "Ecosystem agent index", - "description": "Machine-oriented entrypoint for ecosystem organization and article companions." 
- }, - { - "href": "/.agents/wiki/index.md", - "label": "Wiki snapshot", - "description": "Published wiki snapshot under /.agents/wiki/." - }, - { - "href": "/.agents/skills/index.md", - "label": "Skills snapshot", - "description": "Published skill snapshot under /.agents/skills/." - } - ], - "usageNotes": [ - { - "title": "Cite canonical page URLs", - "description": "Use the normal page URL when referring to urbit.org as the source." - }, - { - "title": "Use markdown mirrors for efficient reading", - "description": "Prefer *.md mirrors when you want lighter-weight human-readable content." - }, - { - "title": "Use /.agents/* for machine context", - "description": "Prefer agent companions when you want generated frontmatter, dependency hints, or dedicated agent appendices." - }, - { - "title": "Well-known compatibility", - "description": "A mirror also exists at /.well-known/llms.txt for clients that probe the standardized well-known namespace." - } - ] -} diff --git a/app/lib/agentDiscovery.js b/app/lib/agentDiscovery.js new file mode 100644 index 0000000000..76a48842e6 --- /dev/null +++ b/app/lib/agentDiscovery.js @@ -0,0 +1,5 @@ +import agentDiscoveryContent from "./agentDiscoveryContent.cjs"; + +export function getAgentDiscoveryData() { + return agentDiscoveryContent.loadAgentDiscoveryMarkdown(); +} diff --git a/app/lib/agentDiscoveryContent.cjs b/app/lib/agentDiscoveryContent.cjs new file mode 100644 index 0000000000..d41fad869c --- /dev/null +++ b/app/lib/agentDiscoveryContent.cjs @@ -0,0 +1,175 @@ +const fs = require("fs"); +const path = require("path"); +const matter = require("gray-matter"); + +const DEFAULT_RELATIVE_PATH = "app/content/homepage/agent-discovery.md"; + +const SECTION_KEYS = { + "copy prompt": "copyPrompt", + "capability guidance": "capabilityGuidance", + "primary entrypoints": "primaryEntryPoints", + "human markdown mirrors": "humanMarkdownMirrors", + "machine-oriented indexes": "agentSectionIndexes", + "retrieval notes": "usageNotes", +}; + 
+function normalizeHeading(value = "") { /* Canonical lookup form of a heading: stringified, trimmed, lower-cased, with every whitespace run collapsed to one space. */ + return String(value).trim().toLowerCase().replace(/\s+/g, " "); +} + +function collapseText(value = "") { /* Flattens multi-line text to one line: splits on newline runs, trims each piece, drops empty pieces, joins with single spaces. */ + return String(value) + .split(/\n+/) + .map((line) => line.trim()) + .filter(Boolean) + .join(" "); +} + +function splitSections(markdown = "") { /* Buckets the markdown body line by line under its level-two (##) heading and returns an object of key to trimmed section text. */ + const sections = { intro: [] }; /* lines seen before the first ## heading accumulate under intro */ + let currentKey = "intro"; + + String(markdown) + .split(/\r?\n/) + .forEach((line) => { + const headingMatch = line.match(/^##\s+(.+?)\s*$/); + if (headingMatch) { + const heading = normalizeHeading(headingMatch[1]); + currentKey = SECTION_KEYS[heading] || heading; /* known headings map to their data key; unknown headings keep the normalised heading text as key. NOTE(review): this is a plain property lookup, so a heading such as constructor would resolve to an inherited Object.prototype member instead of falling through - confirm authored headings can never collide. */ + if (!sections[currentKey]) { /* a repeated heading keeps appending to the bucket created the first time */ + sections[currentKey] = []; + } + return; /* the heading line itself is not stored */ + } + + sections[currentKey].push(line); + }); + + return Object.fromEntries( + Object.entries(sections).map(([key, lines]) => [key, lines.join("\n").trim()]) + ); +} + +function requireString(value, fieldName, sourcePath) { /* Returns the trimmed string form of value. Any falsy value is treated as empty; an empty result throws an Error naming fieldName and sourcePath. */ + const normalized = String(value || "").trim(); + if (!normalized) { + throw new Error(`Missing ${fieldName} in ${sourcePath}`); + } + return normalized; +} + +function requireSection(sections, sectionKey, sourcePath) { /* Returns the trimmed text stored under sectionKey, throwing when it is absent or blank. The error message names the internal data key, not the markdown heading. */ + const content = String(sections[sectionKey] || "").trim(); + if (!content) { + throw new Error(`Missing ${sectionKey} section in ${sourcePath}`); + } + return content; +} + +function parseNoteItems(sectionContent, sectionName, sourcePath) { /* Parses a bullet list of notes into an array of { title, description }. Blank lines are ignored; every remaining line must be a dash bullet with a bold title, then an em dash or hyphen surrounded by whitespace, then the description. Any other line throws, quoting the offending line. */ + const lines = String(sectionContent) + .split(/\r?\n/) + .map((line) => line.trim()) + .filter(Boolean); + + return lines.map((line) => { + const match = line.match(/^-\s+\*\*(.+?)\*\*\s+[—-]\s+(.+)$/); + if (!match) { + throw new Error(`Invalid note item in ${sectionName} (${sourcePath}): ${line}`); + } + + return { + title: match[1].trim(), + description: match[2].trim(), + }; + }); +} + +function parseLinkItems(sectionContent, sectionName, sourcePath) { /* Parses a bullet list of markdown links into an array of { href, label, description }. Blank lines are ignored; every remaining line must be a dash bullet holding one [label](href) link, then an em dash or hyphen surrounded by whitespace, then the description. The href may not contain whitespace or a closing parenthesis. Any other line throws, quoting the offending line. */ + const lines = String(sectionContent) + .split(/\r?\n/) + .map((line) => line.trim()) + .filter(Boolean); + + return lines.map((line) => { + const match =
line.match(/^-\s+\[(.+?)\]\(([^\s\)]+)\)\s+[—-]\s+(.+)$/); + if (!match) { + throw new Error(`Invalid link item in ${sectionName} (${sourcePath}): ${line}`); + } + + return { + href: match[2].trim(), + label: match[1].trim(), + description: match[3].trim(), + }; + }); +} + +function parseAgentDiscoveryMarkdown(rawContent, sourcePath = DEFAULT_RELATIVE_PATH) { /* Pure parser: turns the full markdown text (frontmatter plus body) into the discovery data object. sourcePath is used only in error messages. Throws when the frontmatter cannot be parsed, when title or footer_title is missing, when the intro text or any required section is missing, or when a list line is malformed. */ + let parsed; + try { + parsed = matter(rawContent); /* gray-matter result: frontmatter fields are read from parsed.data, the markdown body from parsed.content */ + } catch (error) { + throw new Error(`Failed to parse frontmatter for ${sourcePath}: ${error.message}`); + } + + const sections = splitSections(parsed.content || ""); + const description = collapseText(sections.intro || ""); /* body text before the first ## heading becomes the homepage description */ + const copyPrompt = collapseText(requireSection(sections, "copyPrompt", sourcePath)); /* evaluated before the object below, so a missing Copy prompt section is the first error reported */ + + const discoveryData = { + homepage: { + title: requireString(parsed.data.title, "title", sourcePath), + footerTitle: requireString(parsed.data.footer_title, "footer_title", sourcePath), + description: requireString(description, "intro description", sourcePath), + copyPrompt: requireString(copyPrompt, "copy prompt", sourcePath), + }, + capabilityGuidance: parseNoteItems( + requireSection(sections, "capabilityGuidance", sourcePath), + "Capability guidance", + sourcePath + ), + primaryEntryPoints: parseLinkItems( + requireSection(sections, "primaryEntryPoints", sourcePath), + "Primary entrypoints", + sourcePath + ), + humanMarkdownMirrors: parseLinkItems( + requireSection(sections, "humanMarkdownMirrors", sourcePath), + "Human markdown mirrors", + sourcePath + ), + agentSectionIndexes: parseLinkItems( + requireSection(sections, "agentSectionIndexes", sourcePath), + "Machine-oriented indexes", + sourcePath + ), + usageNotes: parseNoteItems( + requireSection(sections, "usageNotes", sourcePath), + "Retrieval notes", + sourcePath + ), + }; + + return discoveryData; +} + +function loadAgentDiscoveryMarkdown(options = {}) { /* Reads the discovery markdown from disk and parses it. options.filePath wins when given; otherwise DEFAULT_RELATIVE_PATH is joined onto options.baseDir, which falls back to process.cwd(). The read is synchronous and nothing in this function caches the result, so each call reads and parses again. */ + const baseDir = options.baseDir || process.cwd(); + const filePath = options.filePath || path.join(baseDir, DEFAULT_RELATIVE_PATH);
+ + let rawContent; + try { + rawContent = fs.readFileSync(filePath, "utf-8"); + } catch (error) { /* rethrown with the resolved path so the message identifies which file could not be read */ + throw new Error(`Failed to read agent discovery content at ${filePath}: ${error.message}`); + } + + return parseAgentDiscoveryMarkdown(rawContent, filePath); /* the resolved path doubles as sourcePath in parser errors */ +} + +module.exports = { + DEFAULT_RELATIVE_PATH, + loadAgentDiscoveryMarkdown, + parseAgentDiscoveryMarkdown, +}; diff --git a/package.json b/package.json index 7f484b6934..15199e93ee 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "build": "next build", "start": "next start", "lint": "eslint .", + "test:agent-discovery": "node scripts/test-agent-discovery-content.js", "test:umami-guard": "node scripts/test-umami-event-guard.js" }, "dependencies": { diff --git a/scripts/build-ai-legibility.js b/scripts/build-ai-legibility.js index 5120d1da92..9b89c45727 100644 --- a/scripts/build-ai-legibility.js +++ b/scripts/build-ai-legibility.js @@ -13,10 +13,11 @@ const fs = require("fs"); const path = require("path"); const { glob } = require("glob"); -const discoveryConfig = require("../app/lib/agent-discovery.json"); +const { loadAgentDiscoveryMarkdown } = require("../app/lib/agentDiscoveryContent.cjs"); const { llmsConfig } = require("./ai-legibility-config"); const { buildSummaryInfo, + isContentIndexExcluded, normalizeArray, normalizeSearchTerms, parseFrontMatter, @@ -47,6 +48,7 @@ const OUTPUT_AGENTS = path.join(process.cwd(), "public/agents.md"); const SUMMARY_RECOMMENDED_MAX = 280; const SUMMARY_ENFORCED_DIRS = ["blog/", "blurbs/", "overview/"]; const EXCLUDED_SECTIONS = new Set(["grants", "events", "singles"]); +const discoveryConfig = loadAgentDiscoveryMarkdown(); const loadConfigFrontMatter = (relativePath) => { const filePath = path.join(CONTENT_DIR, relativePath); @@ -196,6 +198,10 @@ const buildContentIndex = async () => { continue; } + if (isContentIndexExcluded(parsed.data)) { + continue; + } + const relativePath = path.relative(CONTENT_DIR, filePath).replace(/\\/g, "/"); const routeInfo =
resolveSearchEntry(relativePath, blurbRoutes); if (!routeInfo) { diff --git a/scripts/build-search-index.js b/scripts/build-search-index.js index 976ae4e5ae..d483111d56 100644 --- a/scripts/build-search-index.js +++ b/scripts/build-search-index.js @@ -16,6 +16,7 @@ const { collectAuthors, collectFrontMatterValues, collectTags, + isSearchExcluded, normalizeSearchTerms, parseFrontMatter, parsePublishedTimestamp, @@ -96,6 +97,10 @@ async function buildSearchIndex() { continue; } + if (isSearchExcluded(parsed.data)) { + continue; + } + const routeInfo = resolveSearchEntry(relativePath, blurbRoutes); if (!routeInfo) { continue; diff --git a/scripts/lib/content-parse.js b/scripts/lib/content-parse.js index d19a4eddb4..0fbec836e6 100644 --- a/scripts/lib/content-parse.js +++ b/scripts/lib/content-parse.js @@ -8,6 +8,25 @@ const normalizeArray = (value) => { const uniqueStrings = (values) => Array.from(new Set(values.filter(Boolean))); +const isTruthyFrontMatterValue = (value) => /* True for boolean true, or for any value whose trimmed, lower-cased string form is exactly the word true; everything else, including missing values, is false. */ + value === true || String(value || "").trim().toLowerCase() === "true"; + +const isSearchExcluded = (frontMatter = {}) => /* True when any of the four flags listed below is truthy per isTruthyFrontMatterValue. Snake-case and camel-case spellings are both checked, and index_exclude / indexExclude are the same generic flags that isContentIndexExcluded also checks. */ + [ + frontMatter.search_exclude, + frontMatter.searchExclude, + frontMatter.index_exclude, + frontMatter.indexExclude, + ].some(isTruthyFrontMatterValue); + +const isContentIndexExcluded = (frontMatter = {}) => /* True when any of the four flags listed below is truthy per isTruthyFrontMatterValue. Mirrors isSearchExcluded but with content_index_exclude / contentIndexExclude as the specific flags, plus the shared index_exclude / indexExclude. */ + [ + frontMatter.content_index_exclude, + frontMatter.contentIndexExclude, + frontMatter.index_exclude, + frontMatter.indexExclude, + ].some(isTruthyFrontMatterValue); + const toTitleCase = (value = "") => String(value) .split("-") @@ -223,6 +242,8 @@ module.exports = { collectTags, extractFirstParagraph, extractMarkdownLinks, + isContentIndexExcluded, + isSearchExcluded, normalizeArray, normalizeHeadingIds, normalizeSearchTerms, diff --git a/scripts/test-agent-discovery-content.js b/scripts/test-agent-discovery-content.js new file mode 100644 index 0000000000..04fa58678e --- /dev/null +++ b/scripts/test-agent-discovery-content.js @@ -0,0 +1,50 @@ +#!/usr/bin/env node
+ +const assert = require("assert/strict"); /* Smoke test for the markdown-backed discovery content: plain node script, no test framework; any failed assertion throws and the success line at the end is never printed. */ + +const { + loadAgentDiscoveryMarkdown, + parseAgentDiscoveryMarkdown, +} = require("../app/lib/agentDiscoveryContent.cjs"); + +const data = loadAgentDiscoveryMarkdown(); /* called without options, so the default relative path is resolved against process.cwd(); the script therefore has to be run with the repository root as working directory */ + +assert.equal(data.homepage.title, "Instructions for Large Language Models"); +assert.equal(data.homepage.footerTitle, "instructions for ai agents"); +assert.equal( + data.homepage.description, + "Use these routes to discover urbit.org's machine-facing content without relying on the main navigation." +); +assert.ok(data.homepage.copyPrompt.includes("https://urbit.org/llms.txt")); + +assert.equal(data.capabilityGuidance.length, 2); /* the five counts below pin the number of bullets per section in the markdown file; update them whenever an entry is added or removed there */ +assert.equal(data.primaryEntryPoints.length, 5); +assert.equal(data.humanMarkdownMirrors.length, 4); +assert.equal(data.agentSectionIndexes.length, 7); +assert.equal(data.usageNotes.length, 4); + +assert.deepEqual(data.primaryEntryPoints[0], { /* full-shape check on one link item: exactly href, label and description */ + href: "/llms.txt", + label: "llms.txt", + description: "Canonical plain-text instructions for automated readers.", +}); +assert.equal(data.primaryEntryPoints[1].label, "For AI Agents"); +assert.equal(data.agentSectionIndexes[6].href, "/.agents/skills/index.md"); +assert.equal(data.usageNotes[3].title, "Well-known compatibility"); + +assert.throws( /* negative case: the frontmatter is valid and intro text is present, but the body has no ## sections, so the parser fails on its first required section, copyPrompt */ + () => + parseAgentDiscoveryMarkdown( + [ + "---", + "title: Incomplete", + "footer_title: incomplete", + "---", + "This file intentionally omits required sections.", + ].join("\n"), + "incomplete-agent-discovery.md" + ), + /Missing copyPrompt section/ +); + +console.log("✓ agent discovery markdown content parsed successfully");