diff --git a/package.json b/package.json index f2e2e01..d9c16ac 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@ainyc/aeo-audit", - "version": "1.4.0", + "version": "1.5.0", "description": "The most comprehensive open-source Answer Engine Optimization (AEO) audit tool. Scores websites across 13 ranking factors that determine AI citation.", "type": "module", "main": "./dist/index.js", diff --git a/skills/aeo/SKILL.md b/skills/aeo/SKILL.md index 91172ea..686e1a9 100644 --- a/skills/aeo/SKILL.md +++ b/skills/aeo/SKILL.md @@ -90,9 +90,11 @@ npx @ainyc/aeo-audit@1 "" --sitemap --top-issues --format json Flags: - `--sitemap [url]` — auto-discover `/sitemap.xml` or provide an explicit URL -- `--limit ` — cap pages audited (sorted by sitemap priority) +- `--limit ` — cap pages audited (default 200, sorted by sitemap priority) - `--top-issues` — skip per-page output, show only cross-cutting patterns +Pages are audited with bounded concurrency (5 in flight) to avoid hammering the target origin. + Returns: - Per-page scores and grades - Cross-cutting issues (factors failing across multiple pages) diff --git a/src/cli.ts b/src/cli.ts index 11da84e..64c0044 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -99,7 +99,7 @@ Options: --include-geo Include optional geographic signals factor --include-agent-skills Include optional agent skill exposure factor (Schema.org Action, MCP, form affordances) --sitemap [url] Audit all pages from sitemap (auto-discovers /sitemap.xml or use explicit URL) - --limit Max pages to audit in sitemap mode (sorted by sitemap priority) + --limit Max pages to audit in sitemap mode (default 200, sorted by sitemap priority) --top-issues In sitemap mode, skip per-page output and show only cross-cutting issues -h, --help Show this help message @@ -142,6 +142,13 @@ export async function main(argv: string[] = process.argv): Promise { sitemapUrl: args.sitemapUrl ?? undefined, limit: args.limit ?? 
undefined, topIssuesOnly: args.topIssues, + onPlan: (plan) => { + if (plan.truncated > 0) { + console.error( + `Notice: sitemap has ${plan.discovered} URLs; auditing top ${plan.willAudit} by priority (--limit ${plan.effectiveLimit}). ${plan.truncated} pages skipped. Pass --limit ${Math.max(plan.discovered, 9999)} to audit all.`, + ) + } + }, } const report = await runSitemapAudit(args.url, options) diff --git a/src/formatters/markdown.ts b/src/formatters/markdown.ts index 16cf70d..51cf316 100644 --- a/src/formatters/markdown.ts +++ b/src/formatters/markdown.ts @@ -63,7 +63,11 @@ export function formatSitemapMarkdown(report: SitemapAuditReport, topIssuesOnly lines.push(``) lines.push(`**Sitemap:** ${report.sitemapUrl}`) lines.push(`**Aggregate Grade:** ${report.aggregateGrade} (${report.aggregateScore}/100)`) - lines.push(`**Pages:** ${report.pagesAudited} audited, ${report.pagesSkipped} skipped, ${report.pagesDiscovered} discovered`) + lines.push(`**Pages:** ${report.pagesAudited} audited of ${report.pagesDiscovered} discovered (${report.pagesFiltered} filtered as non-HTML, ${report.pagesTruncated} truncated by --limit ${report.effectiveLimit})`) + if (report.pagesTruncated > 0) { + lines.push(``) + lines.push(`> **Note:** ${report.pagesTruncated} additional pages were skipped because of the page limit. 
Pass \`--limit ${Math.max(report.pagesDiscovered, 9999)}\` to audit them all.`) + } lines.push(`**Audited:** ${report.auditedAt}`) lines.push(``) diff --git a/src/formatters/text.ts b/src/formatters/text.ts index 89d7349..f4729dd 100644 --- a/src/formatters/text.ts +++ b/src/formatters/text.ts @@ -82,7 +82,10 @@ export function formatSitemapText(report: SitemapAuditReport, topIssuesOnly = fa lines.push(`${DIM}${report.sitemapUrl}${RESET}`) lines.push(``) lines.push(` ${BOLD}Aggregate Grade:${RESET} ${gc}${BOLD}${report.aggregateGrade}${RESET} ${bar(report.aggregateScore, 30)} ${report.aggregateScore}/100`) - lines.push(` ${DIM}${report.pagesAudited} pages audited, ${report.pagesSkipped} skipped, ${report.pagesDiscovered} discovered${RESET}`) + lines.push(` ${DIM}${report.pagesAudited} pages audited of ${report.pagesDiscovered} discovered (${report.pagesFiltered} filtered, ${report.pagesTruncated} truncated by --limit ${report.effectiveLimit})${RESET}`) + if (report.pagesTruncated > 0) { + lines.push(` ${DIM}Note: ${report.pagesTruncated} additional pages skipped by --limit. 
/**
 * Maps `items` through an async `worker` while keeping at most `concurrency`
 * calls in flight, and returns the results in input order.
 *
 * Declared without `export` on purpose: the module exposes it through its
 * trailing `export { ... }` list.
 *
 * @param items - Values to process. Each one is handed to `worker` together
 *   with its index in `items`.
 * @param concurrency - Upper bound on simultaneous `worker` calls. Fractions
 *   are floored, values below 1 (and `NaN`) fall back to 1, and the bound is
 *   never larger than `items.length`.
 * @param worker - Async mapper invoked once per item.
 * @returns An array where position `i` holds the result for `items[i]`.
 *   Resolves to `[]` for empty input without calling `worker`.
 * @throws Rejects with the first error thrown by `worker`. Once that happens
 *   no further items are started; calls already in flight are left to settle.
 */
async function mapWithConcurrency<T, R>(
  items: T[],
  concurrency: number,
  worker: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
  const results = new Array<R>(items.length)

  // NaN would otherwise survive Math.min/Math.max and produce a zero-length
  // worker pool, silently returning an array of holes.
  const requested = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency)
  const workerCount = Math.max(1, Math.min(requested, items.length))

  // A single iterator shared by every worker: each next() hands out the next
  // [index, item] pair exactly once, so no manual index bookkeeping is needed.
  const queue = items.entries()
  let failed = false

  const drain = async (): Promise<void> => {
    for (const [index, item] of queue) {
      // Another worker already rejected; the overall promise is lost, so do
      // not start more work against the target.
      if (failed) return
      try {
        results[index] = await worker(item, index)
      } catch (error) {
        failed = true
        throw error
      }
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => drain()))
  return results
}
?? 0.5) - (a.priority ?? 0.5)) + eligible.sort((a, b) => (b.priority ?? 0.5) - (a.priority ?? 0.5)) - // Apply limit - if (options.limit && options.limit > 0) { - entries = entries.slice(0, options.limit) - } + // Apply limit (default 200 when not specified — large sitemaps are common and + // a full sweep is rarely what the user wants). + const effectiveLimit = options.limit && options.limit > 0 ? options.limit : DEFAULT_LIMIT + const entries = eligible.slice(0, effectiveLimit) + const truncated = eligible.length - entries.length if (entries.length === 0) { throw new AeoAuditError('BAD_INPUT', 'No auditable URLs found in sitemap.') } - const skipped = allCount - entries.length + options.onPlan?.({ + discovered, + filtered, + truncated, + willAudit: entries.length, + effectiveLimit, + }) + const auditOptions: RunAeoAuditOptions = { factors: options.factors, includeGeo: options.includeGeo, } - // Audit each page (sequentially to avoid hammering the target) - const pageResults: SitemapPageResult[] = [] - const successReports: AuditReport[] = [] - - for (const entry of entries) { - try { - const report = await runAeoAudit(entry.loc, auditOptions) - successReports.push(report) - pageResults.push({ - url: report.finalUrl, - overallScore: report.overallScore, - overallGrade: report.overallGrade, - status: 'success', - factors: report.factors, - metadata: report.metadata, - }) - } catch (error) { - const message = error instanceof Error ? error.message : String(error) - pageResults.push({ - url: entry.loc, - overallScore: 0, - overallGrade: 'F', - status: 'error', - error: message, - }) - } - } + // Audit pages with bounded concurrency: 5 workers is a polite ceiling for one + // origin while giving a meaningful speedup over fully sequential. 
+ const settled = await mapWithConcurrency( + entries, + DEFAULT_CONCURRENCY, + async (entry): Promise<{ pageResult: SitemapPageResult; report: AuditReport | null }> => { + try { + const report = await runAeoAudit(entry.loc, auditOptions) + return { + pageResult: { + url: report.finalUrl, + overallScore: report.overallScore, + overallGrade: report.overallGrade, + status: 'success', + factors: report.factors, + metadata: report.metadata, + }, + report, + } + } catch (error) { + const message = error instanceof Error ? error.message : String(error) + return { + pageResult: { + url: entry.loc, + overallScore: 0, + overallGrade: 'F', + status: 'error', + error: message, + }, + report: null, + } + } + }, + ) + + const pageResults: SitemapPageResult[] = settled.map((s) => s.pageResult) + const successReports: AuditReport[] = settled + .map((s) => s.report) + .filter((r): r is AuditReport => r !== null) // Calculate aggregate score from successful audits const successScores = pageResults.filter((p) => p.status === 'success').map((p) => p.overallScore) @@ -273,9 +316,12 @@ export async function runSitemapAudit(rawUrl: string, options: SitemapAuditOptio return { sitemapUrl, auditedAt: new Date().toISOString(), - pagesDiscovered: allCount, + pagesDiscovered: discovered, pagesAudited: entries.length, - pagesSkipped: skipped, + pagesSkipped: filtered + truncated, + pagesFiltered: filtered, + pagesTruncated: truncated, + effectiveLimit, aggregateScore, aggregateGrade: scoreToGrade(aggregateScore), pages: pageResults, @@ -284,4 +330,4 @@ export async function runSitemapAudit(rawUrl: string, options: SitemapAuditOptio } } -export { parseSitemapXml, shouldSkipUrl, buildCrossCuttingIssues } +export { parseSitemapXml, shouldSkipUrl, buildCrossCuttingIssues, mapWithConcurrency } diff --git a/src/types.ts b/src/types.ts index c8cc82f..21b0358 100644 --- a/src/types.ts +++ b/src/types.ts @@ -174,6 +174,9 @@ export interface SitemapAuditReport { pagesDiscovered: number pagesAudited: 
number pagesSkipped: number + pagesFiltered: number + pagesTruncated: number + effectiveLimit: number aggregateScore: number aggregateGrade: string pages: SitemapPageResult[] @@ -181,8 +184,17 @@ export interface SitemapAuditReport { prioritizedFixes: string[] } +export interface SitemapAuditPlan { + discovered: number + filtered: number + truncated: number + willAudit: number + effectiveLimit: number +} + export interface SitemapAuditOptions extends RunAeoAuditOptions { sitemapUrl?: string limit?: number topIssuesOnly?: boolean + onPlan?: (plan: SitemapAuditPlan) => void } diff --git a/test/sitemap.test.ts b/test/sitemap.test.ts index 28e805b..0a92a91 100644 --- a/test/sitemap.test.ts +++ b/test/sitemap.test.ts @@ -1,6 +1,6 @@ import { test, expect } from 'vitest' -import { parseSitemapXml, shouldSkipUrl } from '../src/sitemap.js' +import { mapWithConcurrency, parseSitemapXml, shouldSkipUrl } from '../src/sitemap.js' test('parseSitemapXml extracts loc and priority from url blocks', () => { const xml = ` @@ -61,3 +61,41 @@ test('shouldSkipUrl allows HTML content pages', () => { expect(shouldSkipUrl('https://example.com/page.html')).toBe(false) expect(shouldSkipUrl('https://example.com/page.htm')).toBe(false) }) + +test('mapWithConcurrency preserves input order and caps in-flight workers', async () => { + const items = Array.from({ length: 20 }, (_, i) => i) + let inFlight = 0 + let peakInFlight = 0 + + const results = await mapWithConcurrency(items, 5, async (item) => { + inFlight += 1 + peakInFlight = Math.max(peakInFlight, inFlight) + // Yield to the event loop a few times so workers actually overlap. 
+ await new Promise((resolve) => setTimeout(resolve, 1)) + inFlight -= 1 + return item * 2 + }) + + expect(results).toEqual(items.map((i) => i * 2)) + expect(peakInFlight).toBeLessThanOrEqual(5) + expect(peakInFlight).toBeGreaterThan(1) +}) + +test('mapWithConcurrency handles empty input', async () => { + const results = await mapWithConcurrency([], 5, async (n) => n) + expect(results).toEqual([]) +}) + +test('mapWithConcurrency caps workers to item count when items < concurrency', async () => { + let peakInFlight = 0 + let inFlight = 0 + const results = await mapWithConcurrency([1, 2], 10, async (n) => { + inFlight += 1 + peakInFlight = Math.max(peakInFlight, inFlight) + await new Promise((resolve) => setTimeout(resolve, 1)) + inFlight -= 1 + return n + }) + expect(results).toEqual([1, 2]) + expect(peakInFlight).toBeLessThanOrEqual(2) +})