diff --git a/src/core/ai-index.test.ts b/src/core/ai-index.test.ts index 8fa95a1..fecc26a 100644 --- a/src/core/ai-index.test.ts +++ b/src/core/ai-index.test.ts @@ -30,6 +30,10 @@ const baseConfig: ResolvedAeoConfig = { aiIndex: true, schema: true, }, + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '' }, widget: { enabled: true, @@ -137,6 +141,59 @@ describe('generateAIIndex', () => { expect(entry?.keywords).not.toContain('ux'); }); + it('should use configured max chunk length', () => { + const config: ResolvedAeoConfig = { + ...baseConfig, + aiIndex: { + ...baseConfig.aiIndex, + maxChunkLength: 20, + }, + pages: [ + { + pathname: '/chunked', + title: 'Chunked', + content: [ + 'First paragraph content.', + 'Second paragraph content.', + 'Third paragraph content.', + ].join('\n\n'), + }, + ], + }; + + const result = generateAIIndex(config); + const index = JSON.parse(result); + const entries = index.entries + .filter((e: any) => e.url === 'https://example.com/chunked') + .sort((a: any, b: any) => a.metadata.chunkIndex - b.metadata.chunkIndex); + + expect(entries).toHaveLength(3); + expect(entries.map((entry: any) => entry.metadata.chunkIndex)).toEqual([0, 1, 2]); + }); + + it('should use configured max keywords', () => { + const config: ResolvedAeoConfig = { + ...baseConfig, + aiIndex: { + ...baseConfig.aiIndex, + maxKeywords: 2, + }, + pages: [ + { + pathname: '/keywords', + title: 'Keywords', + content: 'alpha alpha alpha beta beta gamma delta epsilon', + }, + ], + }; + + const result = generateAIIndex(config); + const index = JSON.parse(result); + const entry = index.entries.find((e: any) => e.url === 'https://example.com/keywords'); + + expect(entry?.keywords).toEqual(['alpha', 'beta']); + }); + it('should handle pages without content', () => { const result = generateAIIndex(baseConfig); const index = JSON.parse(result); diff --git a/src/core/ai-index.ts b/src/core/ai-index.ts index b455993..4e17963 100644 --- a/src/core/ai-index.ts +++ b/src/core/ai-index.ts @@ -4,7 +4,9 @@ import { createHash } from 'crypto'; import type { ResolvedAeoConfig, AIIndexEntry } from '../types'; import { parseFrontmatter, extractTitle } from './utils'; -function extractKeywords(content: string): string[] { +function extractKeywords(content: string, maxKeywords: number): string[] { + if (maxKeywords < 1) return []; + const words = content .normalize('NFC') .toLowerCase() @@ -22,11 +24,11 @@ function extractKeywords(content: string): string[] { return Object.entries(wordCount) .sort((a, b) => b[1] - a[1]) - .slice(0, 10) + .slice(0, maxKeywords) .map(([word]) => word); } -function chunkContent(content: string, maxLength: number = 2000): string[] { +function chunkContent(content: string, maxLength: number): string[] { const chunks: string[] = []; const paragraphs = content.split('\n\n'); @@ -66,9 +68,9 @@ function collectAIIndexEntries(dir: string, config: ResolvedAeoConfig, base: str const urlPath = relativePath.replace(/\.mdx?$/, ''); const url = `${config.url}/${urlPath}`; - const chunks = chunkContent(mainContent); + const chunks = chunkContent(mainContent, config.aiIndex.maxChunkLength); const title = frontmatter.title || extractTitle(mainContent); - const keywords = extractKeywords(mainContent); + const keywords = extractKeywords(mainContent, config.aiIndex.maxKeywords); chunks.forEach((chunk, index) => { const id = createHash('sha256') @@ -115,8 +117,8 @@ export function generateAIIndex(config: ResolvedAeoConfig): string { const content = page.content || ''; if (content) { - const chunks = chunkContent(content); - const keywords = extractKeywords(content); + const chunks = chunkContent(content, config.aiIndex.maxChunkLength); + const keywords = extractKeywords(content, config.aiIndex.maxKeywords); chunks.forEach((chunk, index) => { const id = createHash('sha256') diff --git a/src/core/audit.test.ts b/src/core/audit.test.ts index 8c3e3c0..d0b78de 100644 --- a/src/core/audit.test.ts +++ b/src/core/audit.test.ts @@ -11,6 +11,7 @@ function makeConfig(overrides: Partial = {}): ResolvedAeoConf outDir: './out', contentDir: '', generators: { robotsTxt: true, llmsTxt: true, llmsFullTxt: true, rawMarkdown: true, manifest: true, sitemap: true, aiIndex: true, schema: true }, + aiIndex: { maxChunkLength: 2000, maxKeywords: 10 }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '/sitemap.xml' }, schema: { enabled: true, diff --git a/src/core/generate-wrapper.test.ts b/src/core/generate-wrapper.test.ts index 1804baf..beb6efd 100644 --- a/src/core/generate-wrapper.test.ts +++ b/src/core/generate-wrapper.test.ts @@ -37,6 +37,10 @@ const baseConfig: ResolvedAeoConfig = { aiIndex: true, schema: true, }, + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '' }, widget: { enabled: true, diff --git a/src/core/llms-full.test.ts b/src/core/llms-full.test.ts index d601ed7..7d5055c 100644 --- a/src/core/llms-full.test.ts +++ b/src/core/llms-full.test.ts @@ -26,6 +26,10 @@ const baseConfig: ResolvedAeoConfig = { aiIndex: true, schema: true, }, + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '' }, widget: { enabled: true, diff --git a/src/core/llms-txt.test.ts b/src/core/llms-txt.test.ts index ea744a8..650cb43 100644 --- a/src/core/llms-txt.test.ts +++ b/src/core/llms-txt.test.ts @@ -26,6 +26,10 @@ const baseConfig: ResolvedAeoConfig = { aiIndex: true, schema: true, }, + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '' }, widget: { enabled: true, diff --git a/src/core/manifest.test.ts b/src/core/manifest.test.ts index 0f7e176..391e973 100644 --- a/src/core/manifest.test.ts +++ b/src/core/manifest.test.ts @@ -31,6 +31,10 @@ const baseConfig: ResolvedAeoConfig = { aiIndex: true, schema: true, }, + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '' }, widget: { enabled: true, diff --git a/src/core/raw-markdown.test.ts b/src/core/raw-markdown.test.ts index daf2740..ef37b64 100644 --- a/src/core/raw-markdown.test.ts +++ b/src/core/raw-markdown.test.ts @@ -48,6 +48,10 @@ const createConfig = (overrides = {}): ResolvedAeoConfig => ({ aiIndex: true, schema: true, }, + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '' }, widget: { enabled: true, diff --git a/src/core/report.test.ts b/src/core/report.test.ts index fd87771..d9e8a1d 100644 --- a/src/core/report.test.ts +++ b/src/core/report.test.ts @@ -14,6 +14,7 @@ function makeConfig(): ResolvedAeoConfig { outDir: './out', contentDir: '', generators: { robotsTxt: true, llmsTxt: true, llmsFullTxt: true, rawMarkdown: true, manifest: true, sitemap: true, aiIndex: true, schema: true }, + aiIndex: { maxChunkLength: 2000, maxKeywords: 10 }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '/sitemap.xml' }, schema: { enabled: true, organization: { name: 'Test Co', url: 'https://test.com', logo: '', sameAs: [] }, defaultType: 'WebPage' }, og: { enabled: true, image: '', twitterHandle: '', type: 'website' }, diff --git a/src/core/robots.test.ts b/src/core/robots.test.ts index 0273693..31152ba 100644 --- a/src/core/robots.test.ts +++ b/src/core/robots.test.ts @@ -20,6 +20,10 @@ describe('generateRobotsTxt', () => { aiIndex: true, schema: true, }, + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '' }, widget: { enabled: true, @@ -104,4 +108,4 @@ describe('generateRobotsTxt', () => { expect(bingbotMatches.length).toBe(1) expect(semrushMatches.length).toBe(1) }) -}) \ No newline at end of file +}) diff --git a/src/core/sitemap.test.ts b/src/core/sitemap.test.ts index 2feec43..caf3422 100644 --- a/src/core/sitemap.test.ts +++ b/src/core/sitemap.test.ts @@ -50,6 +50,10 @@ describe('generateSitemap', () => { aiIndex: true, schema: true, }, + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, robots: { allow: ['/'], disallow: [], crawlDelay: 0, sitemap: '' }, widget: { enabled: true, @@ -281,4 +285,4 @@ describe('generateSitemap', () => { expect(sitemap).toContain('https://example.com/docs/guide'); expect(sitemap).toContain('https://example.com/docs/api/reference'); }); -}); \ No newline at end of file +}); diff --git a/src/core/utils.test.ts b/src/core/utils.test.ts index 62c1a48..c1c4d19 100644 --- a/src/core/utils.test.ts +++ b/src/core/utils.test.ts @@ -24,6 +24,10 @@ describe('utils', () => { expect(result.url).toBe('https://example.com'); expect(result.generators.robotsTxt).toBe(true); expect(result.generators.llmsTxt).toBe(true); + expect(result.aiIndex).toEqual({ + maxChunkLength: 2000, + maxKeywords: 10, + }); expect(result.widget.enabled).toBe(true); expect(result.widget.position).toBe('bottom-right'); }); @@ -54,6 +58,17 @@ describe('utils', () => { expect(result.widget.theme.background).toBe('rgba(18, 18, 24, 0.9)'); }); + it('should handle partial aiIndex config', () => { + const result = resolveConfig({ + aiIndex: { + maxKeywords: 5, + }, + }); + + expect(result.aiIndex.maxKeywords).toBe(5); + expect(result.aiIndex.maxChunkLength).toBe(2000); + }); + it('should resolve robots config', () => { const result = resolveConfig({ robots: { disallow: ['/admin'], crawlDelay: 5 }, diff --git a/src/core/utils.ts b/src/core/utils.ts index e2527c0..5995ad6 100644 --- a/src/core/utils.ts +++ b/src/core/utils.ts @@ -54,6 +54,10 @@ export function resolveConfig(config: AeoConfig = {}): ResolvedAeoConfig { aiIndex: config.generators?.aiIndex !== false, schema: config.generators?.schema !== false, }, + aiIndex: { + maxChunkLength: config.aiIndex?.maxChunkLength ?? 2000, + maxKeywords: config.aiIndex?.maxKeywords ?? 10, + }, robots: { allow: config.robots?.allow || ['/'], disallow: config.robots?.disallow || [], @@ -199,4 +203,4 @@ export function getAllMarkdownFiles( scanDirectory(projectRoot); return files; -} \ No newline at end of file +} diff --git a/src/types.ts b/src/types.ts index fb64d64..c8c35ec 100644 --- a/src/types.ts +++ b/src/types.ts @@ -22,6 +22,10 @@ export interface AeoConfig { aiIndex?: boolean; schema?: boolean; }; + aiIndex?: { + maxChunkLength?: number; + maxKeywords?: number; + }; robots?: { allow?: string[]; disallow?: string[]; @@ -77,6 +81,10 @@ export interface ResolvedAeoConfig { aiIndex: boolean; schema: boolean; }; + aiIndex: { + maxChunkLength: number; + maxKeywords: number; + }; robots: { allow: string[]; disallow: string[]; @@ -172,4 +180,4 @@ export interface FrameworkInfo { framework: FrameworkType; contentDir: string; outDir: string; -} \ No newline at end of file +} diff --git a/website/src/content/docs/reference/configuration.mdx b/website/src/content/docs/reference/configuration.mdx index 473862c..7511872 100644 --- a/website/src/content/docs/reference/configuration.mdx +++ b/website/src/content/docs/reference/configuration.mdx @@ -32,6 +32,12 @@ export default defineConfig({ schema: true, }, + // Configure ai-index.json generation + aiIndex: { + maxChunkLength: 2000, + maxKeywords: 10, + }, + // Customize robots.txt robots: { allow: ['/'], @@ -103,6 +109,13 @@ export default defineConfig({ | `aiIndex` | `boolean` | `true` | Generate `ai-index.json` | | `schema` | `boolean` | `false` | Generate JSON-LD structured data | +### `aiIndex` + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `maxChunkLength` | `number` | `2000` | Target maximum content length per `ai-index.json` chunk; chunks split on paragraph boundaries | +| `maxKeywords` | `number` | `10` | Maximum keywords extracted for each `ai-index.json` entry | + ### `robots` | Option | Type | Default | Description |