Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 1 addition & 10 deletions cloud/app/lib/content/route-config.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ export interface ContentRouteOptions<TMeta extends ContentMeta> {

/** Content type for Open Graph (defaults to "website") */
ogType?: "website" | "article";
/** Robots directive (e.g., "noindex, nofollow") */
robots?: string;
/** Function to generate social card image path from meta */
getImagePath?: (meta: TMeta) => string;
}
Expand Down Expand Up @@ -120,7 +118,6 @@ export function createContentRouteConfig<TMeta extends ContentMeta>(
head: createContentHead<TMeta>({
allMetas,
ogType: options.ogType,
robots: options.robots,
getImagePath: options.getImagePath,
}),

Expand Down Expand Up @@ -194,7 +191,6 @@ function isBlogMeta(meta: ContentMeta): meta is BlogMeta {
interface CreateContentHeadOptions<TMeta extends ContentMeta> {
allMetas: TMeta[];
ogType?: "website" | "article";
robots?: string;
getImagePath?: (meta: TMeta) => string;
}

Expand All @@ -209,7 +205,7 @@ interface CreateContentHeadOptions<TMeta extends ContentMeta> {
function createContentHead<TMeta extends ContentMeta>(
options: CreateContentHeadOptions<TMeta>,
) {
const { allMetas, ogType = "website", robots, getImagePath } = options;
const { allMetas, ogType = "website", getImagePath } = options;

return (ctx: {
match: { pathname: string };
Expand Down Expand Up @@ -245,11 +241,6 @@ function createContentHead<TMeta extends ContentMeta>(
{ name: "description", content: meta.description },
];

// Add robots if specified
if (robots) {
metaTags.push({ name: "robots", content: robots });
}

// Add Open Graph tags
metaTags.push(
...generateOpenGraphMeta({
Expand Down
221 changes: 221 additions & 0 deletions cloud/app/lib/robots.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
import { describe, it, expect } from "vitest";
import {
parseSitemapForUrlsWithoutChangefreq,
generateRobotsTxt,
} from "./robots";

// Test suite for the sitemap parser: URLs lacking a <changefreq> tag are
// treated as low-priority and their pathnames are returned for robots.txt
// Disallow rules. The root path "/" is always excluded.
describe("parseSitemapForUrlsWithoutChangefreq", () => {
  it("returns empty array for empty sitemap", () => {
    expect(parseSitemapForUrlsWithoutChangefreq("")).toEqual([]);
  });

  it("returns empty array for sitemap with no url blocks", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([]);
  });

  it("extracts paths from URLs without changefreq", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/page1</loc>
</url>
<url>
<loc>https://example.com/page2</loc>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([
      "/page1",
      "/page2",
    ]);
  });

  it("excludes URLs that have changefreq", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/static</loc>
</url>
<url>
<loc>https://example.com/dynamic</loc>
<changefreq>weekly</changefreq>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/static"]);
  });

  // Any changefreq value (daily/monthly/never/...) excludes the URL —
  // the parser only cares about the tag's presence, not its value.
  it("handles changefreq with different values", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<loc>https://example.com/daily</loc>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://example.com/monthly</loc>
<changefreq>monthly</changefreq>
</url>
<url>
<loc>https://example.com/never</loc>
<changefreq>never</changefreq>
</url>
<url>
<loc>https://example.com/no-freq</loc>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/no-freq"]);
  });

  it("is case-insensitive for changefreq tag", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<loc>https://example.com/upper</loc>
<CHANGEFREQ>weekly</CHANGEFREQ>
</url>
<url>
<loc>https://example.com/mixed</loc>
<ChangeFreq>weekly</ChangeFreq>
</url>
<url>
<loc>https://example.com/none</loc>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/none"]);
  });

  // "/" must never be disallowed even when it has no changefreq entry.
  it("excludes root path /", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<loc>https://example.com/</loc>
</url>
<url>
<loc>https://example.com/page</loc>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/page"]);
  });

  it("handles nested paths", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<loc>https://example.com/docs/getting-started</loc>
</url>
<url>
<loc>https://example.com/blog/2024/post</loc>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([
      "/docs/getting-started",
      "/blog/2024/post",
    ]);
  });

  // Malformed <loc> values surface as errors (via new URL()) rather than
  // being silently dropped — a broken sitemap should fail the build.
  it("throws on invalid URL in loc", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<loc>not-a-valid-url</loc>
</url>
</urlset>`;
    expect(() => parseSitemapForUrlsWithoutChangefreq(sitemap)).toThrow();
  });

  // Only <changefreq> affects inclusion; <priority>/<lastmod> are ignored.
  it("handles url blocks with other tags", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<loc>https://example.com/with-priority</loc>
<priority>0.8</priority>
<lastmod>2024-01-01</lastmod>
</url>
<url>
<loc>https://example.com/with-freq</loc>
<priority>0.5</priority>
<changefreq>weekly</changefreq>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([
      "/with-priority",
    ]);
  });

  it("handles multiline url blocks", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<loc>https://example.com/multiline</loc>
<priority>0.5</priority>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual([
      "/multiline",
    ]);
  });

  // A <url> block with no <loc> cannot yield a path and is skipped.
  it("skips url blocks without loc", () => {
    const sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<priority>0.5</priority>
</url>
<url>
<loc>https://example.com/valid</loc>
</url>
</urlset>`;
    expect(parseSitemapForUrlsWithoutChangefreq(sitemap)).toEqual(["/valid"]);
  });
});

// Test suite for robots.txt rendering: a fixed "User-agent: * / Allow: /"
// preamble, one Disallow line per path, then a Sitemap directive.
describe("generateRobotsTxt", () => {
  const siteUrl = "https://example.com";
  const sitemapUrl = "https://example.com/sitemap.xml";

  it("generates robots.txt with no disallow paths", () => {
    const result = generateRobotsTxt([], siteUrl, sitemapUrl);
    // No Disallow section at all — just the preamble and Sitemap line.
    expect(result).toBe(`# robots.txt for https://example.com
User-agent: *
Allow: /

Sitemap: https://example.com/sitemap.xml`);
  });

  it("generates robots.txt with single disallow path", () => {
    const result = generateRobotsTxt(["/private"], siteUrl, sitemapUrl);
    expect(result).toBe(`# robots.txt for https://example.com
User-agent: *
Allow: /
Disallow: /private

Sitemap: https://example.com/sitemap.xml`);
  });

  // Disallow lines preserve the input order of the paths.
  it("generates robots.txt with multiple disallow paths", () => {
    const result = generateRobotsTxt(
      ["/private", "/admin", "/internal"],
      siteUrl,
      sitemapUrl,
    );
    expect(result).toBe(`# robots.txt for https://example.com
User-agent: *
Allow: /
Disallow: /private
Disallow: /admin
Disallow: /internal

Sitemap: https://example.com/sitemap.xml`);
  });

  it("uses provided sitemap URL", () => {
    const result = generateRobotsTxt(
      [],
      "https://mirascope.com",
      "https://mirascope.com/sitemap.xml",
    );
    expect(result).toContain("Sitemap: https://mirascope.com/sitemap.xml");
  });
});
42 changes: 42 additions & 0 deletions cloud/app/lib/robots.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/**
* Pure functions for robots.txt generation from sitemap data
*/

/**
 * Parse sitemap XML and extract URLs that don't have a changefreq tag.
 * URLs without changefreq are considered low-priority for crawling.
 * The root path "/" is excluded as changefreq doesn't apply globally.
 *
 * This is a lightweight regex scan, not a full XML parser:
 * - <url>, <loc>, and <changefreq> tags are matched case-insensitively
 *   (previously only <changefreq> was).
 * - <loc> content may span multiple lines; surrounding whitespace is trimmed.
 * - <url> blocks with a missing or empty <loc> are skipped.
 *
 * @param sitemapXml - Raw sitemap XML content.
 * @returns Pathnames (e.g. "/docs/intro") of URLs lacking <changefreq>.
 * @throws TypeError when a non-empty <loc> value is not a valid absolute URL,
 *         so broken sitemap data fails loudly instead of being dropped.
 */
export function parseSitemapForUrlsWithoutChangefreq(
  sitemapXml: string,
): string[] {
  // Each <url>...</url> block; non-greedy so adjacent blocks don't merge.
  const urlBlocks = sitemapXml.match(/<url>[\s\S]*?<\/url>/gi) ?? [];

  return (
    urlBlocks
      // Keep only blocks with no <changefreq> tag (any casing or value).
      .filter((block) => !/<changefreq>[\s\S]*?<\/changefreq>/i.test(block))
      // Extract the <loc> text; [\s\S] + trim handles multi-line values.
      .map((block) => block.match(/<loc>([\s\S]*?)<\/loc>/i)?.[1]?.trim())
      // Skip blocks with a missing or empty <loc>.
      .filter((loc): loc is string => loc !== undefined && loc !== "")
      // new URL() validates the location and yields its pathname.
      .map((loc) => new URL(loc).pathname)
      // exclude root path
      .filter((pathname) => pathname !== "/")
  );
}

/**
 * Generate robots.txt content from a list of disallow paths.
 *
 * Output layout: a comment header, a wildcard user-agent section that allows
 * everything except the given paths, a blank separator line, then the
 * Sitemap directive.
 *
 * @param disallowPaths - Paths to emit as `Disallow:` rules, in order.
 * @param siteUrl - Site origin, used only in the header comment.
 * @param sitemapUrl - Absolute URL for the `Sitemap:` directive.
 * @returns The complete robots.txt body (no trailing newline).
 */
export function generateRobotsTxt(
  disallowPaths: string[],
  siteUrl: string,
  sitemapUrl: string,
): string {
  // Build the file line-by-line; the empty string renders as the blank
  // separator between the rules section and the Sitemap directive.
  const lines: string[] = [
    `# robots.txt for ${siteUrl}`,
    "User-agent: *",
    "Allow: /",
    ...disallowPaths.map((path) => `Disallow: ${path}`),
    "",
    `Sitemap: ${sitemapUrl}`,
  ];

  return lines.join("\n");
}
21 changes: 1 addition & 20 deletions cloud/app/lib/seo/head.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -296,9 +296,6 @@ describe("createPageHead", () => {
content: "summary_large_image",
});

// No robots
expect(findMeta(result.meta, "robots")).toBeUndefined();

// No scripts (not an article)
expect(result.scripts).toBeUndefined();
});
Expand Down Expand Up @@ -446,23 +443,7 @@ describe("createPageHead", () => {
});
});

describe("Test Case 7: With robots noindex", () => {
it("includes robots meta tag", () => {
const result = createPageHead({
route: "/dev/tools",
title: "Dev Tools",
description: "Development tools",
robots: "noindex, nofollow",
});

expect(findMeta(result.meta, "robots")).toEqual({
name: "robots",
content: "noindex, nofollow",
});
});
});

describe("Test Case 8: Empty description", () => {
describe("Test Case 7: Empty description", () => {
it("handles empty description", () => {
const result = createPageHead({
route: "/empty-desc",
Expand Down
10 changes: 1 addition & 9 deletions cloud/app/lib/seo/head.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { BASE_URL } from "@/app/lib/site";

/**
* Head metadata entry for routes.
* Supports title, name-based meta tags (e.g., description, robots),
* Supports title, name-based meta tags (e.g., description),
* property-based meta tags (e.g., og:title, twitter:card), and charset.
*/
export type HeadMetaEntry =
Expand Down Expand Up @@ -70,8 +70,6 @@ export interface PageHeadOptions {
ogType?: "website" | "article";
/** Custom image path or URL for social cards */
image?: string;
/** Robots directive (e.g., "noindex, nofollow") */
robots?: string;
/** Article metadata for blog posts */
article?: ArticleMeta;
}
Expand Down Expand Up @@ -252,7 +250,6 @@ export function createPageHead(options: PageHeadOptions): HeadResult {
description,
ogType = "website",
image,
robots,
article,
} = options;

Expand All @@ -273,11 +270,6 @@ export function createPageHead(options: PageHeadOptions): HeadResult {
{ name: "description", content: description },
];

// Add robots if specified
if (robots) {
metaTags.push({ name: "robots", content: robots });
}

// Add Open Graph tags
metaTags.push(
...generateOpenGraphMeta({
Expand Down
6 changes: 4 additions & 2 deletions cloud/public/robots.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# https://www.robotstxt.org/robotstxt.html
# robots.txt for development (overwritten by vite build)
User-agent: *
Disallow: /
Allow: /

Sitemap: https://mirascope.com/sitemap.xml
Loading