diff --git a/cloud/app/lib/content/route-config.tsx b/cloud/app/lib/content/route-config.tsx
index c9917fd991..c3e14f5f9e 100644
--- a/cloud/app/lib/content/route-config.tsx
+++ b/cloud/app/lib/content/route-config.tsx
@@ -76,8 +76,6 @@ export interface ContentRouteOptions {
   /** Content type for Open Graph (defaults to "website") */
   ogType?: "website" | "article";
-  /** Robots directive (e.g., "noindex, nofollow") */
-  robots?: string;
   /** Function to generate social card image path from meta */
   getImagePath?: (meta: TMeta) => string;
 }
 
@@ -120,7 +118,6 @@ export function createContentRouteConfig(
     head: createContentHead({
       allMetas,
       ogType: options.ogType,
-      robots: options.robots,
       getImagePath: options.getImagePath,
     }),
 
@@ -194,7 +191,6 @@ function isBlogMeta(meta: ContentMeta): meta is BlogMeta {
 interface CreateContentHeadOptions {
   allMetas: TMeta[];
   ogType?: "website" | "article";
-  robots?: string;
   getImagePath?: (meta: TMeta) => string;
 }
 
@@ -209,7 +205,7 @@ interface CreateContentHeadOptions {
 function createContentHead(
   options: CreateContentHeadOptions,
 ) {
-  const { allMetas, ogType = "website", robots, getImagePath } = options;
+  const { allMetas, ogType = "website", getImagePath } = options;
 
   return (ctx: {
     match: { pathname: string };
@@ -245,11 +241,6 @@ function createContentHead(
     { name: "description", content: meta.description },
   ];
 
-  // Add robots if specified
-  if (robots) {
-    metaTags.push({ name: "robots", content: robots });
-  }
-
   // Add Open Graph tags
   metaTags.push(
     ...generateOpenGraphMeta({
diff --git a/cloud/app/lib/robots.test.ts b/cloud/app/lib/robots.test.ts
new file mode 100644
index 0000000000..87cb31cc92
--- /dev/null
+++ b/cloud/app/lib/robots.test.ts
@@ -0,0 +1,221 @@
+import { describe, it, expect } from "vitest";
+import {
+  parseSitemapForUrlsWithoutChangefreq,
+  generateRobotsTxt,
+} from "./robots";
+
+describe("parseSitemapForUrlsWithoutChangefreq", () => {
+  it("returns empty array for empty sitemap", () => {
+    expect(parseSitemapForUrlsWithoutChangefreq("")).toEqual([]);
+  });
+
+  it("returns empty array for sitemap with no url blocks", () => {
+    const sitemap = `
+<urlset>
+</urlset>`;
+    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([]);
+  });
+
+  it("extracts paths from URLs without changefreq", () => {
+    const sitemap = `
+<urlset>
+  <url>
+    <loc>https://example.com/page1</loc>
+  </url>
+  <url>
+    <loc>https://example.com/page2</loc>
+  </url>
+</urlset>`;
+    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([
+      "/page1",
+      "/page2",
+    ]);
+  });
+
+  it("excludes URLs that have changefreq", () => {
+    const sitemap = `
+<urlset>
+  <url>
+    <loc>https://example.com/static</loc>
+  </url>
+  <url>
+    <loc>https://example.com/dynamic</loc>
+    <changefreq>weekly</changefreq>
+  </url>
+</urlset>`;
+    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/static"]);
+  });
+
+  it("handles changefreq with different values", () => {
+    const sitemap = `
+<urlset>
+  <url>
+    <loc>https://example.com/daily</loc>
+    <changefreq>daily</changefreq>
+  </url>
+  <url>
+    <loc>https://example.com/monthly</loc>
+    <changefreq>monthly</changefreq>
+  </url>
+  <url>
+    <loc>https://example.com/never</loc>
+    <changefreq>never</changefreq>
+  </url>
+  <url>
+    <loc>https://example.com/no-freq</loc>
+  </url>
+</urlset>`;
+    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/no-freq"]);
+  });
+
+  it("is case-insensitive for changefreq tag", () => {
+    const sitemap = `
+<urlset>
+  <url>
+    <loc>https://example.com/upper</loc>
+    <CHANGEFREQ>weekly</CHANGEFREQ>
+  </url>
+  <url>
+    <loc>https://example.com/mixed</loc>
+    <ChangeFreq>weekly</ChangeFreq>
+  </url>
+  <url>
+    <loc>https://example.com/none</loc>
+  </url>
+</urlset>`;
+    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/none"]);
+  });
+
+  it("excludes root path /", () => {
+    const sitemap = `
+<urlset>
+  <url>
+    <loc>https://example.com/</loc>
+  </url>
+  <url>
+    <loc>https://example.com/page</loc>
+  </url>
+</urlset>`;
+    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/page"]);
+  });
+
paths", () => { + const sitemap = ` + + + https://example.com/docs/getting-started + + + https://example.com/blog/2024/post + +`; + expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([ + "/docs/getting-started", + "/blog/2024/post", + ]); + }); + + it("throws on invalid URL in loc", () => { + const sitemap = ` + + + not-a-valid-url + +`; + expect(() => parseSitemapForUrlsWithoutChangefreq(sitemap)).toThrow(); + }); + + it("handles url blocks with other tags", () => { + const sitemap = ` + + + https://example.com/with-priority + 0.8 + 2024-01-01 + + + https://example.com/with-freq + 0.5 + weekly + +`; + expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([ + "/with-priority", + ]); + }); + + it("handles multiline url blocks", () => { + const sitemap = ` + + + https://example.com/multiline + 0.5 + +`; + expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([ + "/multiline", + ]); + }); + + it("skips url blocks without loc", () => { + const sitemap = ` + + + 0.5 + + + https://example.com/valid + +`; + expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/valid"]); + }); +}); + +describe("generateRobotsTxt", () => { + const siteUrl = "https://example.com"; + const sitemapUrl = "https://example.com/sitemap.xml"; + + it("generates robots.txt with no disallow paths", () => { + const result = generateRobotsTxt([], siteUrl, sitemapUrl); + expect(result).toBe(`# robots.txt for https://example.com +User-agent: * +Allow: / + +Sitemap: https://example.com/sitemap.xml`); + }); + + it("generates robots.txt with single disallow path", () => { + const result = generateRobotsTxt(["/private"], siteUrl, sitemapUrl); + expect(result).toBe(`# robots.txt for https://example.com +User-agent: * +Allow: / +Disallow: /private + +Sitemap: https://example.com/sitemap.xml`); + }); + + it("generates robots.txt with multiple disallow paths", () => { + const result = generateRobotsTxt( + ["/private", "/admin", "/internal"], + siteUrl, + sitemapUrl, + ); + expect(result).toBe(`# robots.txt for https://example.com +User-agent: * +Allow: / +Disallow: /private +Disallow: /admin +Disallow: /internal + +Sitemap: https://example.com/sitemap.xml`); + }); + + it("uses provided sitemap URL", () => { + const result = generateRobotsTxt( + [], + "https://mirascope.com", + "https://mirascope.com/sitemap.xml", + ); + expect(result).toContain("Sitemap: https://mirascope.com/sitemap.xml"); + }); +}); diff --git a/cloud/app/lib/robots.ts b/cloud/app/lib/robots.ts new file mode 100644 index 0000000000..0c8eaf5286 --- /dev/null +++ b/cloud/app/lib/robots.ts @@ -0,0 +1,42 @@ +/** + * Pure functions for robots.txt generation from sitemap data + */ + +/** + * Parse sitemap XML and extract URLs that don't have a changefreq tag. + * URLs without changefreq are considered low-priority for crawling. + * The root path "/" is excluded as changefreq doesn't apply globally. + */ +export function parseSitemapForUrlsWithoutChangefreq( + sitemapXml: string, +): string[] { + const urlMatches = sitemapXml.match(/[\s\S]*?<\/url>/g) ?? 
+  const urlMatches = sitemapXml.match(/<url>[\s\S]*?<\/url>/g) ?? [];
+
+  return (
+    urlMatches
+      .filter((urlBlock) => !/<changefreq>.*?<\/changefreq>/i.test(urlBlock))
+      .map((urlBlock) => urlBlock.match(/<loc>(.*?)<\/loc>/)?.[1])
+      .filter((loc): loc is string => loc !== undefined)
+      .map((loc) => new URL(loc).pathname)
+      // exclude root path
+      .filter((pathname) => pathname !== "/")
+  );
+}
+
+/**
+ * Generate robots.txt content from a list of disallow paths
+ */
+export function generateRobotsTxt(
+  disallowPaths: string[],
+  siteUrl: string,
+  sitemapUrl: string,
+): string {
+  const baseRules = `# robots.txt for ${siteUrl}
+User-agent: *
+Allow: /
+`;
+
+  const disallowRules = disallowPaths.map((p) => `Disallow: ${p}`).join("\n");
+
+  return `${baseRules}${disallowRules ? `${disallowRules}\n` : ""}\nSitemap: ${sitemapUrl}`;
+}
diff --git a/cloud/app/lib/seo/head.test.ts b/cloud/app/lib/seo/head.test.ts
index 32026f4bdf..d8a1d775cf 100644
--- a/cloud/app/lib/seo/head.test.ts
+++ b/cloud/app/lib/seo/head.test.ts
@@ -296,9 +296,6 @@ describe("createPageHead", () => {
       content: "summary_large_image",
     });
 
-    // No robots
-    expect(findMeta(result.meta, "robots")).toBeUndefined();
-
     // No scripts (not an article)
     expect(result.scripts).toBeUndefined();
   });
@@ -446,23 +443,7 @@ describe("createPageHead", () => {
     });
   });
 
-  describe("Test Case 7: With robots noindex", () => {
-    it("includes robots meta tag", () => {
-      const result = createPageHead({
-        route: "/dev/tools",
-        title: "Dev Tools",
-        description: "Development tools",
-        robots: "noindex, nofollow",
-      });
-
-      expect(findMeta(result.meta, "robots")).toEqual({
-        name: "robots",
-        content: "noindex, nofollow",
-      });
-    });
-  });
-
-  describe("Test Case 8: Empty description", () => {
+  describe("Test Case 7: Empty description", () => {
     it("handles empty description", () => {
       const result = createPageHead({
         route: "/empty-desc",
diff --git a/cloud/app/lib/seo/head.ts b/cloud/app/lib/seo/head.ts
index cc058c0f10..1887902642 100644
--- a/cloud/app/lib/seo/head.ts
+++ b/cloud/app/lib/seo/head.ts
@@ -11,7 +11,7 @@ import { BASE_URL } from "@/app/lib/site";
 
 /**
  * Head metadata entry for routes.
- * Supports title, name-based meta tags (e.g., description, robots),
+ * Supports title, name-based meta tags (e.g., description),
  * property-based meta tags (e.g., og:title, twitter:card), and charset.
 */
 export type HeadMetaEntry =
@@ -70,8 +70,6 @@ export interface PageHeadOptions {
   ogType?: "website" | "article";
   /** Custom image path or URL for social cards */
   image?: string;
-  /** Robots directive (e.g., "noindex, nofollow") */
-  robots?: string;
   /** Article metadata for blog posts */
   article?: ArticleMeta;
 }
@@ -252,7 +250,6 @@ export function createPageHead(options: PageHeadOptions): HeadResult {
     description,
     ogType = "website",
     image,
-    robots,
     article,
   } = options;
 
@@ -273,11 +270,6 @@ export function createPageHead(options: PageHeadOptions): HeadResult {
     { name: "description", content: description },
   ];
 
-  // Add robots if specified
-  if (robots) {
-    metaTags.push({ name: "robots", content: robots });
-  }
-
   // Add Open Graph tags
   metaTags.push(
     ...generateOpenGraphMeta({
diff --git a/cloud/public/robots.txt b/cloud/public/robots.txt
index b21f0887ac..1ae660870d 100644
--- a/cloud/public/robots.txt
+++ b/cloud/public/robots.txt
@@ -1,3 +1,5 @@
-# https://www.robotstxt.org/robotstxt.html
+# robots.txt for development (overwritten by vite build)
 User-agent: *
-Disallow: /
+Allow: /
+
+Sitemap: https://mirascope.com/sitemap.xml
diff --git a/cloud/vite-plugins/README.md b/cloud/vite-plugins/README.md
index e995158078..b016ce8fcf 100644
--- a/cloud/vite-plugins/README.md
+++ b/cloud/vite-plugins/README.md
@@ -61,6 +61,7 @@ readTime: "5 min read"
 ### Type Safety
 
 TypeScript types are provided in:
+
 - `app/types/mdx.d.ts` - types for MDX imports
 
 ## Content Plugin (`content.ts`)
@@ -93,6 +94,7 @@ blogPosts.forEach(post => {
 ### Content Types
 
 The plugin recognizes content types based on directory structure:
+
 - `content/blog/` → type: "blog"
 - `content/docs/` → type: "docs"
 - `content/policy/` → type: "policy"
@@ -101,6 +103,7 @@ The plugin recognizes content types based on directory structure:
 ### Blog Metadata
 
 Blog posts include additional fields:
+
 - `date`: Publication date
 - `author`: Author name
 - `readTime`: Estimated reading time
@@ -116,6 +119,7 @@ Blog posts include additional fields:
 ### Type Safety
 
 TypeScript types are provided in:
+
 - `app/types/virtual-content-meta.d.ts` - types for the virtual module
 - `app/lib/content/types.ts` - `ContentMeta` and `BlogMeta` interfaces
 
@@ -172,3 +176,56 @@ The plugin looks for source images in `public/` and processes them on-the-fly du
 After the build completes, the plugin scans `dist/client/assets/` and **fails the build** if any `.png`, `.jpg`, or `.jpeg` files are found. This ensures all images are WebP format in production.
 
 SVG and GIF files are allowed (they're valid non-raster or animated formats).
+
+## Robots Plugin (`robots.ts`)
+
+Generates a production `robots.txt` from the sitemap, disallowing low-priority URLs. The sitemap is generated during prerendering, and only entries included in the prerender have a `changefreq` tag; entries without `changefreq` are disallowed.
+
+### Features
+
+- **Sitemap-driven**: Reads the generated `sitemap.xml` after build
+- **Selective disallow**: URLs without a `<changefreq>` tag are disallowed (not included in prerender, considered low-priority)
+- **Post-build generation**: Runs after sitemap generation to ensure sitemap exists
+- **Environment-aware**: Different behavior for development vs production
+
+### How It Works
+
+1. After the build completes, the plugin reads `dist/client/sitemap.xml`
+2. It parses all `<url>` entries and identifies those without a `<changefreq>` tag
+3. URLs without `changefreq` are added as `Disallow` rules in robots.txt
+4. The generated `robots.txt` is written to `dist/client/robots.txt`
+
+The logic assumes that URLs with `changefreq` are high-value pages (docs, blog posts) that should be crawled, while URLs without it (app routes, utility pages) should be excluded from search engines.
+
+### Build-time vs Runtime
+
+- **Development**: `public/robots.txt` is served directly (permissive, allows all)
+- **Production**: `dist/client/robots.txt` is generated with selective disallow rules
+
+### Generated Output Example
+
+```txt
+# robots.txt for https://mirascope.com
+User-agent: *
+Allow: /
+Disallow: /dashboard
+Disallow: /settings
+Disallow: /api/health
+
+Sitemap: https://mirascope.com/sitemap.xml
+```
+
+### Dependencies
+
+The plugin uses helper functions from `app/lib/robots.ts`:
+
+- `parseSitemapForUrlsWithoutChangefreq()` - Extracts URLs without changefreq from sitemap XML
+- `generateRobotsTxt()` - Generates the robots.txt content string
+
+### Error Handling
+
+The plugin will **fail the build** if:
+
+- The sitemap file does not exist at `dist/client/sitemap.xml`
+
+This ensures the sitemap plugin runs before the robots plugin.
diff --git a/cloud/vite-plugins/robots.ts b/cloud/vite-plugins/robots.ts
new file mode 100644
index 0000000000..027143d671
--- /dev/null
+++ b/cloud/vite-plugins/robots.ts
@@ -0,0 +1,71 @@
+/**
+ * Vite plugin for generating robots.txt from sitemap
+ *
+ * Generates a production robots.txt from the sitemap, disallowing low-priority URLs.
+ * The sitemap is generated during prerendering, and only entries included in the
+ * prerender have a changefreq; entries without changefreq will be disallowed.
+ *
+ * This plugin reads the generated sitemap.xml after build and extracts URLs that
+ * don't have a changefreq tag at all, then generates a static robots.txt file in
+ * dist/client/ that disallows those URLs.
+ *
+ * In development, public/robots.txt is served (allow all).
+ * In production, dist/client/robots.txt (generated) is served.
+ */
+
+import type { Plugin } from "vite";
+import fs from "node:fs/promises";
+import path from "node:path";
+import {
+  parseSitemapForUrlsWithoutChangefreq,
+  generateRobotsTxt,
+} from "../app/lib/robots";
+import { BASE_URL } from "../app/lib/site";
+
+const SITEMAP_URL = `${BASE_URL}/sitemap.xml`;
+
+export function viteRobots(): Plugin {
+  return {
+    name: "vite-plugin-robots",
+    enforce: "post",
+
+    buildApp: {
+      order: "post",
+      handler: async (builder) => {
+        const clientOutDir =
+          builder.environments["client"]?.config.build.outDir ??
+          builder.config.build.outDir;
+        const sitemapPath = path.resolve(
+          process.cwd(),
+          clientOutDir,
+          "sitemap.xml",
+        );
+        const outputPath = path.resolve(
+          process.cwd(),
+          clientOutDir,
+          "robots.txt",
+        );
+
+        // Fail if sitemap does not exist
+        try {
+          await fs.access(sitemapPath);
+        } catch {
+          throw new Error(`[robots] Sitemap not found at ${sitemapPath}`);
+        }
+
+        const sitemapXml = await fs.readFile(sitemapPath, "utf-8");
+        const disallowPaths = parseSitemapForUrlsWithoutChangefreq(sitemapXml);
+        const robotsContent = generateRobotsTxt(
+          disallowPaths,
+          BASE_URL,
+          SITEMAP_URL,
+        );
+
+        await fs.writeFile(outputPath, robotsContent);
+        console.log(
+          `[robots] Generated robots.txt with ${disallowPaths.length} disallow paths`,
+        );
+      },
+    },
+  };
+}
diff --git a/cloud/vite.config.ts b/cloud/vite.config.ts
index f601291c79..aa374d0cff 100644
--- a/cloud/vite.config.ts
+++ b/cloud/vite.config.ts
@@ -6,6 +6,7 @@ import path from "path";
 import { viteMDX } from "./vite-plugins/mdx";
 import { viteContent } from "./vite-plugins/content";
 import { viteImages } from "./vite-plugins/images";
+import { viteRobots } from "./vite-plugins/robots";
 import { defineConfig } from "vite";
 
 export default defineConfig(() => {
@@ -32,7 +33,10 @@ export default defineConfig(() => {
       retryDelay: 0,
       maxRedirects: 5,
       failOnError: true,
+      // for now, pages not included in prerendering will be disallowed in robots.txt
       filter: (page: { path: string }) =>
+        page.path.startsWith("/home") ||
+        page.path.startsWith("/pricing") ||
         page.path.startsWith("/docs") ||
         page.path.startsWith("/blog") ||
         page.path.startsWith("/terms") ||
@@ -51,6 +55,7 @@ export default defineConfig(() => {
         host: "https://mirascope.com",
       },
     }),
+    viteRobots(), // Generate robots disallow paths from sitemap (must be after tanstackStart)
     viteReact(),
     tailwindcss(),
   ],