From faa2508d1c0849175a3eb7d4609d28335360e898 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sun, 28 Sep 2025 14:51:57 -0700 Subject: [PATCH 01/20] feat: add ETag and lastModified support for caching in fetchers and strategies --- AGENTS.md | 1 + src/scraper/fetcher/BrowserFetcher.ts | 4 ++ src/scraper/fetcher/FileFetcher.ts | 14 +++++- src/scraper/fetcher/HttpFetcher.ts | 11 +++++ src/scraper/fetcher/types.ts | 12 +++++ .../strategies/GitHubRepoScraperStrategy.ts | 2 + .../strategies/GitHubWikiScraperStrategy.ts | 2 + .../strategies/LocalFileStrategy.test.ts | 6 ++- src/scraper/strategies/LocalFileStrategy.ts | 2 + src/scraper/strategies/WebScraperStrategy.ts | 2 + src/store/DocumentStore.test.ts | 49 +++++++++++++++++++ src/store/DocumentStore.ts | 10 +++- 12 files changed, 110 insertions(+), 5 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index a459d330..170febea 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -66,6 +66,7 @@ - Focus on high value, low effort tests first. Defer complex mocking, complex state management testing and concurrent processing unless explicitly requested by the user. - Always test the intended bevavior, not the implementation details. - Avoid timing sensitive tests unless absolutely necessary. +- Use `npx vite-node` to run individual TypeScript files. ## Git diff --git a/src/scraper/fetcher/BrowserFetcher.ts b/src/scraper/fetcher/BrowserFetcher.ts index 27446da3..d9db4df7 100644 --- a/src/scraper/fetcher/BrowserFetcher.ts +++ b/src/scraper/fetcher/BrowserFetcher.ts @@ -72,12 +72,16 @@ export class BrowserFetcher implements ContentFetcher { const contentType = response.headers()["content-type"] || "text/html"; const { mimeType, charset } = MimeTypeUtils.parseContentType(contentType); + // Extract ETag header for caching + const etag = response.headers().etag; + return { content: contentBuffer, mimeType, charset, encoding: undefined, // Browser handles encoding automatically source: finalUrl, + etag, } satisfies RawContent; } catch (error) { if (options?.signal?.aborted) { diff --git a/src/scraper/fetcher/FileFetcher.ts b/src/scraper/fetcher/FileFetcher.ts index 219c9602..727c40d3 100644 --- a/src/scraper/fetcher/FileFetcher.ts +++ b/src/scraper/fetcher/FileFetcher.ts @@ -1,3 +1,4 @@ +import crypto from "node:crypto"; import fs from "node:fs/promises"; import { ScraperError } from "../../utils/errors"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; @@ -28,16 +29,27 @@ export class FileFetcher implements ContentFetcher { } try { - const content = await fs.readFile(filePath); + const [content, stats] = await Promise.all([ + fs.readFile(filePath), + fs.stat(filePath), + ]); // Use enhanced MIME type detection that properly handles source code files const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath); const mimeType = detectedMimeType || "application/octet-stream"; + // Generate pseudo-ETag from last modified time + const etag = crypto + .createHash("md5") + .update(stats.mtime.toISOString()) + .digest("hex"); + return { content, mimeType, source, + etag, + lastModified: stats.mtime.toISOString(), // Don't assume charset for text files - let the pipeline detect it }; } catch (error: unknown) { diff --git a/src/scraper/fetcher/HttpFetcher.ts b/src/scraper/fetcher/HttpFetcher.ts index 3e1da077..32846b4e 100644 --- a/src/scraper/fetcher/HttpFetcher.ts +++ b/src/scraper/fetcher/HttpFetcher.ts @@ -165,12 +165,23 @@ export class HttpFetcher implements ContentFetcher { response.config?.url || source; + // Extract ETag header for caching + const 
etag = response.headers.etag || response.headers.ETag; + + // Extract Last-Modified header for caching + const lastModified = response.headers["last-modified"]; + const lastModifiedISO = lastModified + ? new Date(lastModified).toISOString() + : undefined; + return { content, mimeType, charset, encoding: contentEncoding, source: finalUrl, + etag, + lastModified: lastModifiedISO, } satisfies RawContent; } catch (error: unknown) { const axiosError = error as AxiosError; diff --git a/src/scraper/fetcher/types.ts b/src/scraper/fetcher/types.ts index 1a76393f..7f401af1 100644 --- a/src/scraper/fetcher/types.ts +++ b/src/scraper/fetcher/types.ts @@ -20,6 +20,18 @@ export interface RawContent { encoding?: string; /** Original source location */ source: string; + /** + * ETag value for caching purposes. + * For HTTP sources, this comes from the ETag header. + * For local files, this is a hash of the last modified date. + */ + etag?: string; + /** + * Last modified timestamp in ISO8601 format. + * For HTTP sources, this comes from the Last-Modified header. + * For local files, this is the file modification time. + */ + lastModified?: string; } /** diff --git a/src/scraper/strategies/GitHubRepoScraperStrategy.ts b/src/scraper/strategies/GitHubRepoScraperStrategy.ts index 1e52a0ad..576fe484 100644 --- a/src/scraper/strategies/GitHubRepoScraperStrategy.ts +++ b/src/scraper/strategies/GitHubRepoScraperStrategy.ts @@ -471,6 +471,8 @@ export class GitHubRepoScraperStrategy extends BaseScraperStrategy { title: hasValidTitle ? processedTitle : fallbackTitle, library: options.library, version: options.version, + etag: rawContent.etag, + lastModified: rawContent.lastModified, }, contentType: rawContent.mimeType, // Preserve the detected MIME type } satisfies Document, diff --git a/src/scraper/strategies/GitHubWikiScraperStrategy.ts b/src/scraper/strategies/GitHubWikiScraperStrategy.ts index 59b3e7d2..e99336a2 100644 --- a/src/scraper/strategies/GitHubWikiScraperStrategy.ts +++ b/src/scraper/strategies/GitHubWikiScraperStrategy.ts @@ -157,6 +157,8 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { : pageTitle, library: options.library, version: options.version, + etag: rawContent.etag, + lastModified: rawContent.lastModified, }, contentType: rawContent.mimeType, }; diff --git a/src/scraper/strategies/LocalFileStrategy.test.ts b/src/scraper/strategies/LocalFileStrategy.test.ts index e871ea0e..0f07a0b9 100644 --- a/src/scraper/strategies/LocalFileStrategy.test.ts +++ b/src/scraper/strategies/LocalFileStrategy.test.ts @@ -49,12 +49,14 @@ describe("LocalFileStrategy", () => { document: { content: "# Test\n\nThis is a test file.", contentType: "text/markdown", - metadata: { + metadata: expect.objectContaining({ url: "file:///test.md", title: "Test", library: "test", version: "1.0", - }, + etag: expect.any(String), + lastModified: expect.any(String), + }), }, }), ); diff --git a/src/scraper/strategies/LocalFileStrategy.ts b/src/scraper/strategies/LocalFileStrategy.ts index 1edfa9d7..0972b8c2 100644 --- a/src/scraper/strategies/LocalFileStrategy.ts +++ b/src/scraper/strategies/LocalFileStrategy.ts @@ -94,6 +94,8 @@ export class LocalFileStrategy extends BaseScraperStrategy { : "Untitled", library: options.library, version: options.version, + etag: rawContent.etag, + lastModified: rawContent.lastModified, }, } satisfies Document, }; diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index 6fcf71a0..274ae17a 100644 --- 
a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -127,6 +127,8 @@ export class WebScraperStrategy extends BaseScraperStrategy { : "Untitled", library: options.library, version: options.version, + etag: rawContent.etag, + lastModified: rawContent.lastModified, ...processed.metadata, }, } satisfies Document, diff --git a/src/store/DocumentStore.test.ts b/src/store/DocumentStore.test.ts index be1f6de6..c14ebc56 100644 --- a/src/store/DocumentStore.test.ts +++ b/src/store/DocumentStore.test.ts @@ -264,6 +264,55 @@ describe("DocumentStore - With Embeddings", () => { expect(versions).toContain("1.0.0"); expect(versions).toContain("2.0.0"); }); + + it("should store and retrieve etag and lastModified metadata", async () => { + const testEtag = '"abc123-def456"'; + const testLastModified = "2023-12-01T10:30:00Z"; + + const docs: Document[] = [ + { + pageContent: "Test document with etag and lastModified", + metadata: { + title: "ETag Test Doc", + url: "https://example.com/etag-test", + path: ["test"], + etag: testEtag, + lastModified: testLastModified, + }, + }, + ]; + + await store.addDocuments("etagtest", "1.0.0", docs); + + // Query the database directly to verify the etag and last_modified are stored + // @ts-expect-error Accessing private property for testing + const db = store.db; + const pageResult = db + .prepare(` + SELECT p.etag, p.last_modified + FROM pages p + JOIN versions v ON p.version_id = v.id + JOIN libraries l ON v.library_id = l.id + WHERE l.name = ? AND COALESCE(v.name, '') = ? AND p.url = ? + `) + .get("etagtest", "1.0.0", "https://example.com/etag-test") as + | { + etag: string | null; + last_modified: string | null; + } + | undefined; + + expect(pageResult).toBeDefined(); + expect(pageResult?.etag).toBe(testEtag); + expect(pageResult?.last_modified).toBe(testLastModified); + + // Also verify we can retrieve the document and it contains the metadata + const results = await store.findByContent("etagtest", "1.0.0", "etag", 10); + expect(results.length).toBeGreaterThan(0); + + const doc = results[0]; + expect(doc.metadata.url).toBe("https://example.com/etag-test"); + }); }); describe("Hybrid Search with Embeddings", () => { diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index 8a4f9240..4b023002 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -960,13 +960,19 @@ export class DocumentStore { // Extract content type from metadata if available const contentType = firstDoc.metadata.contentType || null; + // Extract etag from document metadata if available + const etag = firstDoc.metadata.etag || null; + + // Extract lastModified from document metadata if available + const lastModified = firstDoc.metadata.lastModified || null; + // Insert or update page record this.statements.insertPage.run( versionId, url, title, - null, // etag - will be populated during scraping - null, // last_modified - will be populated during scraping + etag, + lastModified, contentType, ); From ba3a72e2a27118cb4eb3e7c6a6877b21284ed32d Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sat, 4 Oct 2025 05:48:26 -0700 Subject: [PATCH 02/20] feat: implement refresh job functionality --- package-lock.json | 4 +- src/pipeline/PipelineManager.ts | 97 +++++++++++++++++++++++++- src/pipeline/trpc/interfaces.ts | 3 +- src/store/DocumentManagementService.ts | 21 ++++++ src/store/DocumentStore.ts | 42 +++++++++++ src/tools/RefreshVersionTool.ts | 95 +++++++++++++++++++++++++ src/tools/ScrapeTool.ts | 2 +- 7 files changed, 
258 insertions(+), 6 deletions(-) create mode 100644 src/tools/RefreshVersionTool.ts diff --git a/package-lock.json b/package-lock.json index 4ab05410..29b6a142 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@arabold/docs-mcp-server", - "version": "1.26.1", + "version": "1.26.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@arabold/docs-mcp-server", - "version": "1.26.1", + "version": "1.26.0", "hasInstallScript": true, "license": "MIT", "dependencies": { diff --git a/src/pipeline/PipelineManager.ts b/src/pipeline/PipelineManager.ts index 213261bb..8dd09b3c 100644 --- a/src/pipeline/PipelineManager.ts +++ b/src/pipeline/PipelineManager.ts @@ -235,7 +235,7 @@ export class PipelineManager implements IPipeline { /** * Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned). */ - async enqueueJob( + async enqueueScrapeJob( library: string, version: string | undefined | null, options: ScraperOptions, @@ -322,6 +322,99 @@ export class PipelineManager implements IPipeline { return jobId; } + /** + * Enqueues a refresh job for an existing library version by re-scraping all pages + * and using Etag comparison to skip unchanged content. + */ + async enqueueRefreshJob( + library: string, + version: string | undefined | null, + ): Promise { + // Normalize version: treat undefined/null as "" (unversioned) + const normalizedVersion = version ?? ""; + + // First, check if the library version exists + try { + const versionId = await this.store.ensureVersion({ + library, + version: normalizedVersion, + }); + + // Get all pages for this version + const pages = await this.store.getPagesByVersionId(versionId); + + if (pages.length === 0) { + throw new Error( + `No pages found for ${library}@${normalizedVersion || "unversioned"}. Cannot refresh an empty version.`, + ); + } + + logger.info( + `🔄 Starting refresh for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`, + ); + + const jobId = uuidv4(); + const abortController = new AbortController(); + let resolveCompletion!: () => void; + let rejectCompletion!: (reason?: unknown) => void; + + const completionPromise = new Promise((resolve, reject) => { + resolveCompletion = resolve; + rejectCompletion = reject; + }); + // Prevent unhandled rejection warnings if rejection occurs before consumers attach handlers + completionPromise.catch(() => {}); + + const job: InternalPipelineJob = { + id: jobId, + library, + version: normalizedVersion, + status: PipelineJobStatus.QUEUED, + progress: null, + error: null, + createdAt: new Date(), + startedAt: null, + finishedAt: null, + abortController, + completionPromise, + resolveCompletion, + rejectCompletion, + // Database fields (single source of truth) + versionId, + versionStatus: this.mapJobStatusToVersionStatus(PipelineJobStatus.QUEUED), + progressPages: 0, + progressMaxPages: pages.length, + errorMessage: null, + updatedAt: new Date(), + sourceUrl: null, // No single source URL for refresh jobs + scraperOptions: null, + // Add refresh-specific metadata + refreshPages: pages, // Store the pages to refresh + }; + + this.jobMap.set(jobId, job); + this.jobQueue.push(jobId); + logger.info( + `📝 Refresh job enqueued: ${jobId} for ${library}${normalizedVersion ? 
`@${normalizedVersion}` : " (unversioned)"} with ${pages.length} pages`, + ); + + // Update database status to QUEUED + await this.updateJobStatus(job, PipelineJobStatus.QUEUED); + + // Trigger processing if manager is running + if (this.isRunning) { + this._processQueue().catch((error) => { + logger.error(`❌ Error in processQueue during refresh enqueue: ${error}`); + }); + } + + return jobId; + } catch (error) { + logger.error(`❌ Failed to enqueue refresh job: ${error}`); + throw error; + } + } + /** * Enqueues a job using stored scraper options from a previous indexing run. * If no stored options are found, throws an error. @@ -360,7 +453,7 @@ export class PipelineManager implements IPipeline { `🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`, ); - return this.enqueueJob(library, normalizedVersion, completeOptions); + return this.enqueueScrapeJob(library, normalizedVersion, completeOptions); } catch (error) { logger.error(`❌ Failed to enqueue job with stored options: ${error}`); throw error; diff --git a/src/pipeline/trpc/interfaces.ts b/src/pipeline/trpc/interfaces.ts index 9d806b22..0fd18173 100644 --- a/src/pipeline/trpc/interfaces.ts +++ b/src/pipeline/trpc/interfaces.ts @@ -20,11 +20,12 @@ export interface PipelineOptions { export interface IPipeline { start(): Promise; stop(): Promise; - enqueueJob( + enqueueScrapeJob( library: string, version: string | undefined | null, options: ScraperOptions, ): Promise; + enqueueRefreshJob(library: string, version: string | undefined | null): Promise; getJob(jobId: string): Promise; getJobs(status?: PipelineJobStatus): Promise; cancelJob(jobId: string): Promise; diff --git a/src/store/DocumentManagementService.ts b/src/store/DocumentManagementService.ts index 7b0df7f4..dfddd641 100644 --- a/src/store/DocumentManagementService.ts +++ b/src/store/DocumentManagementService.ts @@ -339,6 +339,27 @@ export class DocumentManagementService { logger.info(`🗑️ Deleted ${count} documents`); } + /** + * Removes all documents for a specific page ID. + * This is more efficient than URL-based deletion when the page ID is known. + */ + async removeDocumentsByPageId(pageId: number): Promise { + logger.debug(`🗑️ Removing documents for page ID: ${pageId}`); + const count = await this.store.deleteDocumentsByPageId(pageId); + logger.debug(`🗑️ Deleted ${count} documents for page ID: ${pageId}`); + return count; + } + + /** + * Retrieves all pages for a specific version ID with their metadata. + * Used for refresh operations to get existing pages with their ETags. + */ + async getPagesByVersionId( + versionId: number, + ): Promise> { + return this.store.getPagesByVersionId(versionId); + } + /** * Completely removes a library version and all associated documents. * Also removes the library if no other versions remain. 
diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index 4b023002..67c7bdc3 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -80,6 +80,7 @@ export class DocumentStore { getPageId: Database.Statement<[number, string]>; deleteDocuments: Database.Statement<[string, string]>; deleteDocumentsByUrl: Database.Statement<[string, string, string]>; + deleteDocumentsByPageId: Database.Statement<[number]>; deletePages: Database.Statement<[string, string]>; queryVersions: Database.Statement<[string]>; checkExists: Database.Statement<[string, string]>; @@ -114,6 +115,7 @@ export class DocumentStore { deleteLibraryById: Database.Statement<[number]>; countVersionsByLibraryId: Database.Statement<[number]>; getVersionId: Database.Statement<[string, string]>; + getPagesByVersionId: Database.Statement<[number]>; }; /** @@ -239,6 +241,9 @@ export class DocumentStore { WHERE p.url = ? AND l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '') )`, ), + deleteDocumentsByPageId: this.db.prepare<[number]>( + "DELETE FROM documents WHERE page_id = ?", + ), deletePages: this.db.prepare<[string, string]>( `DELETE FROM pages WHERE version_id IN ( @@ -383,6 +388,9 @@ export class DocumentStore { JOIN libraries l ON v.library_id = l.id WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`, ), + getPagesByVersionId: this.db.prepare<[number]>( + "SELECT id, url, etag FROM pages WHERE version_id = ?", + ), }; this.statements = statements; } @@ -1074,6 +1082,40 @@ export class DocumentStore { } } + /** + * Removes all documents for a specific page ID. + * This is more efficient than URL-based deletion when the page ID is known. + * @returns Number of documents deleted + */ + async deleteDocumentsByPageId(pageId: number): Promise { + try { + const result = this.statements.deleteDocumentsByPageId.run(pageId); + return result.changes; + } catch (error) { + throw new ConnectionError("Failed to delete documents by page ID", error); + } + } + + /** + * Retrieves all pages for a specific version ID with their metadata. + * Used for refresh operations to get existing pages with their ETags. + * @returns Array of page records + */ + async getPagesByVersionId( + versionId: number, + ): Promise> { + try { + const result = this.statements.getPagesByVersionId.all(versionId) as Array<{ + id: number; + url: string; + etag: string | null; + }>; + return result; + } catch (error) { + throw new ConnectionError("Failed to get pages by version ID", error); + } + } + /** * Completely removes a library version and all associated documents. * Optionally removes the library if no other versions remain. diff --git a/src/tools/RefreshVersionTool.ts b/src/tools/RefreshVersionTool.ts new file mode 100644 index 00000000..14b73ac8 --- /dev/null +++ b/src/tools/RefreshVersionTool.ts @@ -0,0 +1,95 @@ +import * as semver from "semver"; +import type { IPipeline } from "../pipeline/trpc/interfaces"; +import { logger } from "../utils/logger"; +import { ValidationError } from "./errors"; + +export interface RefreshVersionToolOptions { + library: string; + version?: string | null; // Make version optional + /** If false, returns jobId immediately without waiting. Defaults to true. */ + waitForCompletion?: boolean; +} + +export interface RefreshResult { + /** Indicates the number of pages refreshed if waitForCompletion was true and the job succeeded. May be 0 or inaccurate if job failed or waitForCompletion was false. 
*/ + pagesRefreshed: number; +} + +/** Return type for RefreshVersionTool.execute */ +export type RefreshExecuteResult = RefreshResult | { jobId: string }; + +/** + * Tool for refreshing an existing library version by re-scraping all pages + * and using Etag comparison to skip unchanged content. + */ +export class RefreshVersionTool { + private pipeline: IPipeline; + + constructor(pipeline: IPipeline) { + this.pipeline = pipeline; + } + + async execute(options: RefreshVersionToolOptions): Promise { + const { library, version, waitForCompletion = true } = options; + + let internalVersion: string; + const partialVersionRegex = /^\d+(\.\d+)?$/; // Matches '1' or '1.2' + + if (version === null || version === undefined) { + internalVersion = ""; + } else { + const validFullVersion = semver.valid(version); + if (validFullVersion) { + internalVersion = validFullVersion; + } else if (partialVersionRegex.test(version)) { + const coercedVersion = semver.coerce(version); + if (coercedVersion) { + internalVersion = coercedVersion.version; + } else { + throw new ValidationError( + `Invalid version format for refreshing: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`, + "RefreshVersionTool", + ); + } + } else { + throw new ValidationError( + `Invalid version format for refreshing: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`, + "RefreshVersionTool", + ); + } + } + + internalVersion = internalVersion.toLowerCase(); + + // Use the injected pipeline instance + const pipeline = this.pipeline; + + // Normalize pipeline version argument: use null for unversioned to be explicit cross-platform + const refreshVersion: string | null = internalVersion === "" ? null : internalVersion; + + // Enqueue the refresh job using the injected pipeline + const jobId = await pipeline.enqueueRefreshJob(library, refreshVersion); + + // Conditionally wait for completion + if (waitForCompletion) { + try { + await pipeline.waitForJobCompletion(jobId); + // Fetch final job state to get status and potentially final page count + const finalJob = await pipeline.getJob(jobId); + const finalPagesRefreshed = finalJob?.progress?.pagesScraped ?? 0; // Get count from final job state + logger.debug( + `Refresh job ${jobId} finished with status ${finalJob?.status}. Pages refreshed: ${finalPagesRefreshed}`, + ); + return { + pagesRefreshed: finalPagesRefreshed, + }; + } catch (error) { + logger.error(`❌ Refresh job ${jobId} failed or was cancelled: ${error}`); + throw error; // Re-throw so the caller knows it failed + } + } + + // If not waiting, return the job ID immediately + return { jobId }; + } +} diff --git a/src/tools/ScrapeTool.ts b/src/tools/ScrapeTool.ts index 4208cf2f..c98ddbf6 100644 --- a/src/tools/ScrapeTool.ts +++ b/src/tools/ScrapeTool.ts @@ -132,7 +132,7 @@ export class ScrapeTool { const enqueueVersion: string | null = internalVersion === "" ? 
null : internalVersion; // Enqueue the job using the injected pipeline - const jobId = await pipeline.enqueueJob(library, enqueueVersion, { + const jobId = await pipeline.enqueueScrapeJob(library, enqueueVersion, { url: url, library: library, version: internalVersion, From 9fc6cf9a46b7e89d088be62cfdc98e79d4a9a0d7 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Mon, 27 Oct 2025 07:48:56 -0700 Subject: [PATCH 03/20] feat(ci): add type checking script and tsconfig for tests --- .github/workflows/ci.yml | 19 +++++++++++-------- package.json | 1 + tsconfig.test.json | 5 +++++ 3 files changed, 17 insertions(+), 8 deletions(-) create mode 100644 tsconfig.test.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 69841ed9..135cb3dd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] jobs: lint: @@ -17,8 +17,8 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '>=20.0.0' # Match engines requirement in package.json - cache: 'npm' + node-version: ">=20.0.0" # Match engines requirement in package.json + cache: "npm" - name: Install dependencies run: npm ci @@ -26,6 +26,9 @@ jobs: - name: Run linter run: npm run lint + - name: Run type checker + run: npm run typecheck + test: name: Test runs-on: ubuntu-latest @@ -37,8 +40,8 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '>=20.0.0' - cache: 'npm' + node-version: ">=20.0.0" + cache: "npm" - name: Install dependencies run: npm ci @@ -60,8 +63,8 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '>=20.0.0' - cache: 'npm' + node-version: ">=20.0.0" + cache: "npm" - name: Install dependencies run: npm ci diff --git a/package.json b/package.json index 6610d537..7fcfcc1f 100644 --- a/package.json +++ b/package.json @@ -36,6 +36,7 @@ "test:e2e:watch": "vitest --config test/vitest.config.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", + "typecheck": "npx tsc --noEmit --project tsconfig.test.json", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" }, diff --git a/tsconfig.test.json b/tsconfig.test.json new file mode 100644 index 00000000..631f88a4 --- /dev/null +++ b/tsconfig.test.json @@ -0,0 +1,5 @@ +{ + "extends": "./tsconfig.json", + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} From 93a6ee33ab0e787bd0a880ffd30d1456f49b9323 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Mon, 27 Oct 2025 07:52:11 -0700 Subject: [PATCH 04/20] feat(scraper): implement efficient version refresh using ETags This commit introduces a version refresh feature that enables efficient re-indexing of previously scraped library versions. By leveraging HTTP ETags, the new mechanism avoids re-processing unchanged pages, significantly reducing processing time, bandwidth usage, and embedding costs. Previously, re-indexing a library version was a wasteful process that required deleting all existing data and re-scraping every page from scratch, even if most of the content was unchanged. This implementation introduces a refresh mechanism that re-visits all previously scraped pages with their stored ETags. This allows the scraper to: - **Skip** unchanged pages (HTTP 304 Not Modified). - **Re-process** only pages that have changed (HTTP 200 OK). 
- **Delete** documents for pages that are no longer available (HTTP 404 Not Found). - **Reused Existing Scraper Infrastructure**: The refresh operation is a standard scrape job with a pre-populated `initialQueue`. This leverages existing logic for progress tracking, error handling, and state management. - **Database Schema Update**: A `depth` column has been added to the `pages` table to ensure that refresh operations respect the original `maxDepth` constraints. A database migration (`010-add-depth-to-pages.sql`) is included to apply this change. - **Conditional Fetching**: The scraper's `processItem` logic has been updated to handle conditional requests. It now correctly processes 304, 404, and 200 HTTP responses to either skip, delete, or update documents. - **Pipeline Manager Integration**: A new `enqueueRefreshJob` method was added to the `PipelineManager` to orchestrate the refresh process by fetching pages from the database and populating the `initialQueue`. --- db/migrations/010-add-depth-to-pages.sql | 10 + src/cli/utils.ts | 4 +- src/pipeline/PipelineClient.test.ts | 18 +- src/pipeline/PipelineClient.ts | 26 +- src/pipeline/PipelineFactory.ts | 3 +- src/pipeline/PipelineManager.test.ts | 185 ++++- src/pipeline/PipelineManager.ts | 104 +-- src/pipeline/PipelineWorker.test.ts | 186 +++-- src/pipeline/PipelineWorker.ts | 88 ++- src/pipeline/trpc/router.ts | 34 +- src/pipeline/types.ts | 28 +- src/scraper/ScraperService.test.ts | 14 +- src/scraper/ScraperService.ts | 4 +- src/scraper/fetcher/BrowserFetcher.ts | 8 +- src/scraper/fetcher/FileFetcher.test.ts | 27 +- src/scraper/fetcher/FileFetcher.ts | 56 +- src/scraper/fetcher/HttpFetcher.test.ts | 29 +- src/scraper/fetcher/HttpFetcher.ts | 37 +- src/scraper/fetcher/types.ts | 40 ++ .../HtmlCheerioParserMiddleware.test.ts | 2 +- .../HtmlJsExecutorMiddleware.test.ts | 6 +- .../HtmlLinkExtractorMiddleware.test.ts | 2 +- .../HtmlMetadataExtractorMiddleware.test.ts | 14 +- .../HtmlMetadataExtractorMiddleware.ts | 2 +- .../HtmlNormalizationMiddleware.test.ts | 4 +- .../HtmlPlaywrightMiddleware.test.ts | 2 +- .../middleware/HtmlPlaywrightMiddleware.ts | 5 +- .../HtmlSanitizerMiddleware.test.ts | 2 +- .../HtmlToMarkdownMiddleware.test.ts | 2 +- .../MarkdownLinkExtractorMiddleware.test.ts | 2 +- ...arkdownMetadataExtractorMiddleware.test.ts | 14 +- .../MarkdownMetadataExtractorMiddleware.ts | 2 +- src/scraper/middleware/types.ts | 6 +- src/scraper/pipelines/BasePipeline.test.ts | 23 +- src/scraper/pipelines/BasePipeline.ts | 8 +- .../pipelines/HtmlPipeline.charset.test.ts | 5 +- src/scraper/pipelines/HtmlPipeline.test.ts | 30 +- src/scraper/pipelines/HtmlPipeline.ts | 15 +- src/scraper/pipelines/JsonPipeline.test.ts | 134 ++-- src/scraper/pipelines/JsonPipeline.ts | 61 +- .../pipelines/MarkdownPipeline.test.ts | 67 +- src/scraper/pipelines/MarkdownPipeline.ts | 14 +- .../PipelineFactory.integration.test.ts | 151 ++-- .../pipelines/SourceCodePipeline.test.ts | 99 +-- src/scraper/pipelines/SourceCodePipeline.ts | 26 +- src/scraper/pipelines/TextPipeline.test.ts | 116 +-- src/scraper/pipelines/TextPipeline.ts | 20 +- src/scraper/pipelines/types.ts | 25 +- .../strategies/BaseScraperStrategy.test.ts | 213 +++++- src/scraper/strategies/BaseScraperStrategy.ts | 215 ++++-- .../GitHubRepoScraperStrategy.test.ts | 131 ++-- .../strategies/GitHubRepoScraperStrategy.ts | 70 +- .../strategies/GitHubScraperStrategy.test.ts | 10 +- .../strategies/GitHubScraperStrategy.ts | 6 +- .../GitHubWikiScraperStrategy.test.ts | 174 ++--- 
.../strategies/GitHubWikiScraperStrategy.ts | 56 +- .../strategies/LocalFileStrategy.test.ts | 581 ++++++++++----- src/scraper/strategies/LocalFileStrategy.ts | 79 ++- src/scraper/strategies/NpmScraperStrategy.ts | 4 +- src/scraper/strategies/PyPiScraperStrategy.ts | 4 +- .../strategies/WebScraperStrategy.test.ts | 501 ++++++++++--- src/scraper/strategies/WebScraperStrategy.ts | 124 ++-- src/scraper/types.ts | 91 ++- src/splitter/GreedySplitter.test.ts | 28 +- src/splitter/GreedySplitter.ts | 22 +- src/splitter/JsonDocumentSplitter.ts | 16 +- src/splitter/SemanticMarkdownSplitter.ts | 12 +- src/splitter/TextDocumentSplitter.ts | 4 +- .../TreesitterSourceCodeSplitter.ts | 12 +- src/splitter/types.ts | 4 +- src/store/DocumentManagementService.test.ts | 142 +--- src/store/DocumentManagementService.ts | 138 ++-- src/store/DocumentRetrieverService.test.ts | 312 ++++---- src/store/DocumentRetrieverService.ts | 25 +- src/store/DocumentStore.test.ts | 668 +++++++++--------- src/store/DocumentStore.ts | 369 +++++----- .../ContentAssemblyStrategyFactory.ts | 2 +- .../HierarchicalAssemblyStrategy.test.ts | 418 ++++++----- .../HierarchicalAssemblyStrategy.ts | 90 ++- .../MarkdownAssemblyStrategy.test.ts | 118 ++-- .../strategies/MarkdownAssemblyStrategy.ts | 22 +- src/store/assembly/types.ts | 12 +- src/store/types.ts | 73 +- src/tools/FetchUrlTool.ts | 8 +- src/tools/ListJobsTool.test.ts | 20 +- src/tools/ScrapeTool.test.ts | 20 +- src/types/index.ts | 43 -- test/vector-search-e2e.test.ts | 2 +- 88 files changed, 3854 insertions(+), 2745 deletions(-) create mode 100644 db/migrations/010-add-depth-to-pages.sql diff --git a/db/migrations/010-add-depth-to-pages.sql b/db/migrations/010-add-depth-to-pages.sql new file mode 100644 index 00000000..0c69ec74 --- /dev/null +++ b/db/migrations/010-add-depth-to-pages.sql @@ -0,0 +1,10 @@ +-- Migration 010: Add depth column to pages table for refresh functionality +-- This enables tracking the original crawl depth of each page, which is essential +-- for maintaining consistent depth constraints during refresh operations. + +-- Add depth column to pages table +ALTER TABLE pages ADD COLUMN depth INTEGER; + +-- Backfill existing pages with depth 0 (conservative default) +-- This ensures all existing pages have a valid depth value +UPDATE pages SET depth = 0 WHERE depth IS NULL; diff --git a/src/cli/utils.ts b/src/cli/utils.ts index 0fd819bc..1c3a8316 100644 --- a/src/cli/utils.ts +++ b/src/cli/utils.ts @@ -173,7 +173,7 @@ export async function createPipelineWithCallbacks( ): Promise { logger.debug(`Initializing pipeline with options: ${JSON.stringify(options)}`); const { serverUrl, ...rest } = options; - const pipeline = serverUrl + const pipeline: IPipeline = serverUrl ? await PipelineFactory.createPipeline(undefined, { serverUrl, ...rest }) : await (async () => { if (!docService) { @@ -194,7 +194,7 @@ export async function createPipelineWithCallbacks( }, onJobError: async (job, error, document) => { logger.warn( - `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`, + `⚠️ Job ${job.id} error ${document ? 
`on document ${document.url}` : ""}: ${error.message}`, ); }, }); diff --git a/src/pipeline/PipelineClient.test.ts b/src/pipeline/PipelineClient.test.ts index 13c88f9b..668ffda2 100644 --- a/src/pipeline/PipelineClient.test.ts +++ b/src/pipeline/PipelineClient.test.ts @@ -6,7 +6,8 @@ vi.mock("../utils/logger"); // Mock tRPC client factory const mockClient: any = { ping: { query: vi.fn() }, - enqueueJob: { mutate: vi.fn() }, + enqueueScrapeJob: { mutate: vi.fn() }, + enqueueRefreshJob: { mutate: vi.fn() }, getJob: { query: vi.fn() }, getJobs: { query: vi.fn() }, cancelJob: { mutate: vi.fn() }, @@ -28,7 +29,8 @@ describe("PipelineClient", () => { vi.resetAllMocks(); // Reset default mock behaviors mockClient.ping.query.mockResolvedValue({ status: "ok" }); - mockClient.enqueueJob.mutate.mockResolvedValue({ jobId: "job-123" }); + mockClient.enqueueScrapeJob.mutate.mockResolvedValue({ jobId: "job-123" }); + mockClient.enqueueRefreshJob.mutate.mockResolvedValue({ jobId: "job-456" }); mockClient.getJob.query.mockResolvedValue(undefined); mockClient.getJobs.query.mockResolvedValue({ jobs: [] }); mockClient.cancelJob.mutate.mockResolvedValue({ success: true }); @@ -50,18 +52,18 @@ describe("PipelineClient", () => { }); }); - describe("enqueueJob", () => { + describe("enqueueScrapeJob", () => { it("should delegate job creation to external API", async () => { const mockJobId = "job-123"; - mockClient.enqueueJob.mutate.mockResolvedValueOnce({ jobId: mockJobId }); - const jobId = await client.enqueueJob("react", "18.0.0", { + mockClient.enqueueScrapeJob.mutate.mockResolvedValueOnce({ jobId: mockJobId }); + const jobId = await client.enqueueScrapeJob("react", "18.0.0", { url: "https://react.dev", library: "react", version: "18.0.0", }); expect(jobId).toBe(mockJobId); - expect(mockClient.enqueueJob.mutate).toHaveBeenCalledWith({ + expect(mockClient.enqueueScrapeJob.mutate).toHaveBeenCalledWith({ library: "react", version: "18.0.0", options: { @@ -73,9 +75,9 @@ describe("PipelineClient", () => { }); it("should handle API errors gracefully", async () => { - mockClient.enqueueJob.mutate.mockRejectedValueOnce(new Error("Bad request")); + mockClient.enqueueScrapeJob.mutate.mockRejectedValueOnce(new Error("Bad request")); - await expect(client.enqueueJob("invalid", null, {} as any)).rejects.toThrow( + await expect(client.enqueueScrapeJob("invalid", null, {} as any)).rejects.toThrow( "Failed to enqueue job: Bad request", ); }); diff --git a/src/pipeline/PipelineClient.ts b/src/pipeline/PipelineClient.ts index 85532d9d..c680c189 100644 --- a/src/pipeline/PipelineClient.ts +++ b/src/pipeline/PipelineClient.ts @@ -68,7 +68,7 @@ export class PipelineClient implements IPipeline { logger.debug("PipelineClient stopped"); } - async enqueueJob( + async enqueueScrapeJob( library: string, version: string | undefined | null, options: ScraperOptions, @@ -78,7 +78,7 @@ export class PipelineClient implements IPipeline { typeof version === "string" && version.trim().length === 0 ? null : (version ?? null); - const result = await this.client.enqueueJob.mutate({ + const result = await this.client.enqueueScrapeJob.mutate({ library, version: normalizedVersion, options, @@ -92,6 +92,28 @@ export class PipelineClient implements IPipeline { } } + async enqueueRefreshJob( + library: string, + version: string | undefined | null, + ): Promise { + try { + const normalizedVersion = + typeof version === "string" && version.trim().length === 0 + ? null + : (version ?? 
null); + const result = await this.client.enqueueRefreshJob.mutate({ + library, + version: normalizedVersion, + }); + logger.debug(`Refresh job ${result.jobId} enqueued successfully`); + return result.jobId; + } catch (error) { + throw new Error( + `Failed to enqueue refresh job: ${error instanceof Error ? error.message : String(error)}`, + ); + } + } + async getJob(jobId: string): Promise { try { const serializedJob = await this.client.getJob.query({ id: jobId }); diff --git a/src/pipeline/PipelineFactory.ts b/src/pipeline/PipelineFactory.ts index e2e9aac9..19959b4b 100644 --- a/src/pipeline/PipelineFactory.ts +++ b/src/pipeline/PipelineFactory.ts @@ -24,8 +24,7 @@ export namespace PipelineFactory { // Overload: Remote pipeline client (out-of-process worker) export async function createPipeline( docService: undefined, - options: Required> & - Omit, + options: PipelineOptions & { serverUrl: string }, ): Promise; // Implementation export async function createPipeline( diff --git a/src/pipeline/PipelineManager.test.ts b/src/pipeline/PipelineManager.test.ts index 37fa75bb..ac2d8b40 100644 --- a/src/pipeline/PipelineManager.test.ts +++ b/src/pipeline/PipelineManager.test.ts @@ -15,7 +15,7 @@ vi.mock("uuid", () => { }); import { afterEach, beforeEach, describe, expect, it, type Mock, vi } from "vitest"; -import type { ScraperProgress } from "../scraper/types"; +import type { ScraperProgressEvent } from "../scraper/types"; import type { DocumentManagementService } from "../store/DocumentManagementService"; import { ListJobsTool } from "../tools/ListJobsTool"; import { PipelineManager } from "./PipelineManager"; @@ -67,7 +67,11 @@ describe("PipelineManager", () => { progress: null, error: null, sourceUrl: "https://example.com", - scraperOptions: null, + scraperOptions: { + url: "https://example.com", + library: "test-lib", + version: "1.0.0", + }, abortController: new AbortController(), completionPromise: Promise.resolve(), resolveCompletion: () => {}, @@ -79,13 +83,22 @@ describe("PipelineManager", () => { const createTestProgress = ( pagesScraped: number, totalPages: number, - ): ScraperProgress => ({ + ): ScraperProgressEvent => ({ pagesScraped, totalPages, currentUrl: `https://example.com/page-${pagesScraped}`, depth: 1, maxDepth: 3, totalDiscovered: 0, + result: { + url: `https://example.com/page-${pagesScraped}`, + title: `Page ${pagesScraped}`, + contentType: "text/html", + textContent: "", + links: [], + errors: [], + chunks: [], + }, }); beforeEach(() => { @@ -98,6 +111,10 @@ describe("PipelineManager", () => { updateVersionStatus: vi.fn().mockResolvedValue(undefined), updateVersionProgress: vi.fn().mockResolvedValue(undefined), // For progress tests getVersionsByStatus: vi.fn().mockResolvedValue([]), + // Refresh job methods + ensureVersion: vi.fn().mockResolvedValue(1), + getPagesByVersionId: vi.fn().mockResolvedValue([]), + getScraperOptions: vi.fn().mockResolvedValue(null), }; // Mock the worker's executeJob method @@ -128,7 +145,7 @@ describe("PipelineManager", () => { // --- Enqueueing Tests --- it("should enqueue a job with QUEUED status and return a job ID", async () => { const options = { url: "http://a.com", library: "libA", version: "1.0" }; - const jobId = await manager.enqueueJob("libA", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libA", "1.0", options); const job = await manager.getJob(jobId); expect(job?.status).toBe(PipelineJobStatus.QUEUED); expect(job?.library).toBe("libA"); @@ -149,7 +166,7 @@ describe("PipelineManager", () => { maxPages: 1, 
maxDepth: 1, }; - const jobId = await manager.enqueueJob("libA", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libA", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); const job = await manager.getJob(jobId); @@ -160,7 +177,7 @@ describe("PipelineManager", () => { it("should complete a job and transition to COMPLETED", async () => { const options = { url: "http://a.com", library: "libA", version: "1.0" }; - const jobId = await manager.enqueueJob("libA", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libA", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); await manager.waitForJobCompletion(jobId); @@ -189,7 +206,7 @@ describe("PipelineManager", () => { }), ); } - const jobId1 = await manager.enqueueJob( + const jobId1 = await manager.enqueueScrapeJob( "libA", desc === "unversioned" ? undefined : "1.0", options1, @@ -204,7 +221,7 @@ describe("PipelineManager", () => { library: "libA", version: desc === "unversioned" ? "" : "1.0", }; - const jobId2 = await manager.enqueueJob( + const jobId2 = await manager.enqueueScrapeJob( "libA", desc === "unversioned" ? undefined : "1.0", options2, @@ -228,7 +245,7 @@ describe("PipelineManager", () => { it("should transition job to FAILED if worker throws", async () => { mockWorkerInstance.executeJob.mockRejectedValue(new Error("fail")); const options = { url: "http://fail.com", library: "libFail", version: "1.0" }; - const jobId = await manager.enqueueJob("libFail", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libFail", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); await manager.waitForJobCompletion(jobId).catch(() => {}); // Handle expected rejection @@ -245,7 +262,7 @@ describe("PipelineManager", () => { }), ); const options = { url: "http://cancel.com", library: "libCancel", version: "1.0" }; - const jobId = await manager.enqueueJob("libCancel", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libCancel", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); await manager.cancelJob(jobId); @@ -272,7 +289,7 @@ describe("PipelineManager", () => { library: "libProgress", version: "1.0", }; - const jobId = await manager.enqueueJob("libProgress", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libProgress", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); await manager.waitForJobCompletion(jobId); @@ -286,8 +303,8 @@ describe("PipelineManager", () => { const optionsB = { url: "http://b.com", library: "libB", version: "1.0" }; const pendingPromise = new Promise(() => {}); mockWorkerInstance.executeJob.mockReturnValue(pendingPromise); - const jobIdA = await manager.enqueueJob("libA", "1.0", optionsA); - const jobIdB = await manager.enqueueJob("libB", "1.0", optionsB); + const jobIdA = await manager.enqueueScrapeJob("libA", "1.0", optionsA); + const jobIdB = await manager.enqueueScrapeJob("libB", "1.0", optionsB); await manager.start(); await vi.advanceTimersByTimeAsync(1); const jobA = await manager.getJob(jobIdA); @@ -398,7 +415,7 @@ describe("PipelineManager", () => { describe("Database Status Integration", () => { it("should update database status when job is enqueued", async () => { const options = { url: "http://example.com", library: "test-lib", version: "1.0" }; - await manager.enqueueJob("test-lib", "1.0", options); + await manager.enqueueScrapeJob("test-lib", "1.0", options); // Should ensure library/version exists and 
update status to QUEUED expect(mockStore.ensureLibraryAndVersion).toHaveBeenCalledWith("test-lib", "1.0"); @@ -407,7 +424,7 @@ describe("PipelineManager", () => { it("should handle unversioned jobs correctly", async () => { const options = { url: "http://example.com", library: "test-lib", version: "" }; - await manager.enqueueJob("test-lib", null, options); + await manager.enqueueScrapeJob("test-lib", null, options); // Should treat null version as empty string expect(mockStore.ensureLibraryAndVersion).toHaveBeenCalledWith("test-lib", ""); @@ -476,7 +493,7 @@ describe("PipelineManager", () => { it("should map job statuses to database statuses correctly", async () => { // Test that the mapping function works correctly by checking enum values const options = { url: "http://example.com", library: "test-lib", version: "1.0" }; - const jobId = await manager.enqueueJob("test-lib", "1.0", options); + const jobId = await manager.enqueueScrapeJob("test-lib", "1.0", options); // Verify the job was created with correct status const job = await manager.getJob(jobId); @@ -495,7 +512,9 @@ describe("PipelineManager", () => { const options = { url: "http://example.com", library: "test-lib", version: "1.0" }; // Should not throw even if database update fails - await expect(manager.enqueueJob("test-lib", "1.0", options)).resolves.toBeDefined(); + await expect( + manager.enqueueScrapeJob("test-lib", "1.0", options), + ).resolves.toBeDefined(); // Job should still be created in memory despite database error const allJobs = await manager.getJobs(); @@ -549,7 +568,7 @@ describe("PipelineManager", () => { // This should not cause the system to hang try { - const jobId = await manager.enqueueJob("test-lib", "1.0", options); + const jobId = await manager.enqueueScrapeJob("test-lib", "1.0", options); // If it succeeds, verify the job exists if (jobId) { const job = await manager.getJob(jobId); @@ -582,4 +601,134 @@ describe("PipelineManager", () => { expect(cleanupSpy).toHaveBeenCalledTimes(1); }); }); + + // --- Refresh Job Tests --- + describe("enqueueRefreshJob", () => { + it("should successfully enqueue a refresh job with initial queue", async () => { + // Setup: Mock pages and scraper options for an existing version + const mockPages = [ + { id: 1, url: "https://example.com/page1", depth: 0, etag: "etag1" }, + { id: 2, url: "https://example.com/page2", depth: 1, etag: "etag2" }, + { id: 3, url: "https://example.com/page3", depth: 1, etag: "etag3" }, + ]; + + (mockStore.ensureVersion as Mock).mockResolvedValue(456); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue(mockPages); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: { maxDepth: 2 }, + }); + + // Action: Enqueue a refresh job + const jobId = await manager.enqueueRefreshJob("test-lib", "1.0.0"); + + // Assertions: Verify the job was created with correct properties + expect(jobId).toBeDefined(); + expect(typeof jobId).toBe("string"); + + const job = await manager.getJob(jobId); + expect(job).toBeDefined(); + expect(job?.status).toBe(PipelineJobStatus.QUEUED); + expect(job?.library).toBe("test-lib"); + expect(job?.version).toBe("1.0.0"); + + // Verify the scraper options contain an initialQueue with the same number of pages + // Note: initialQueue is part of ScraperOptions but not VersionScraperOptions (storage type) + expect(job?.scraperOptions).toBeDefined(); + const scraperOpts = job?.scraperOptions as any; + expect(scraperOpts?.initialQueue).toBeDefined(); + 
expect(scraperOpts?.initialQueue).toHaveLength(mockPages.length); + + // Verify maxPages is set to the page count + expect(scraperOpts?.maxPages).toBe(mockPages.length); + }); + + it("should handle unversioned libraries during refresh", async () => { + const mockPages = [ + { id: 1, url: "https://example.com/page1", depth: 0, etag: "etag1" }, + ]; + + (mockStore.ensureVersion as Mock).mockResolvedValue(789); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue(mockPages); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: {}, + }); + + // Action: Enqueue refresh for unversioned library (null/undefined version) + const jobId = await manager.enqueueRefreshJob("unversioned-lib", null); + + // Assertions + const job = await manager.getJob(jobId); + expect(job).toBeDefined(); + expect(job?.library).toBe("unversioned-lib"); + expect(job?.version).toBe(null); // Public API uses null for unversioned + const scraperOpts = job?.scraperOptions as any; + expect(scraperOpts?.initialQueue).toHaveLength(1); + }); + + it("should throw error when refreshing a version with no pages", async () => { + // Setup: Mock empty pages array + (mockStore.ensureVersion as Mock).mockResolvedValue(999); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue([]); + + // Action & Assertion: Should throw with clear error message + await expect(manager.enqueueRefreshJob("empty-lib", "1.0.0")).rejects.toThrow( + "No pages found for empty-lib@1.0.0", + ); + }); + + it("should throw error when refreshing unversioned library with no pages", async () => { + // Setup: Mock empty pages array for unversioned library + (mockStore.ensureVersion as Mock).mockResolvedValue(888); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue([]); + + // Action & Assertion: Should throw with clear error message including "unversioned" + await expect(manager.enqueueRefreshJob("empty-lib", undefined)).rejects.toThrow( + "No pages found for empty-lib@unversioned", + ); + }); + + it("should preserve page depth and etag in initialQueue", async () => { + const mockPages = [ + { id: 10, url: "https://example.com/deep", depth: 5, etag: "deep-etag" }, + { id: 11, url: "https://example.com/shallow", depth: 0, etag: null }, + ]; + + (mockStore.ensureVersion as Mock).mockResolvedValue(111); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue(mockPages); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: {}, + }); + + const jobId = await manager.enqueueRefreshJob("depth-test", "1.0.0"); + const job = await manager.getJob(jobId); + + // Verify initialQueue contains depth and etag information + // Note: initialQueue is part of ScraperOptions but not VersionScraperOptions (storage type) + const scraperOpts = job?.scraperOptions as any; + const queue = scraperOpts?.initialQueue; + expect(queue).toBeDefined(); + expect(queue).toHaveLength(2); + + // Verify deep page + const deepItem = queue?.find( + (item: any) => item.url === "https://example.com/deep", + ); + expect(deepItem).toBeDefined(); + expect(deepItem?.depth).toBe(5); + expect(deepItem?.etag).toBe("deep-etag"); + expect(deepItem?.pageId).toBe(10); + + // Verify shallow page + const shallowItem = queue?.find( + (item: any) => item.url === "https://example.com/shallow", + ); + expect(shallowItem).toBeDefined(); + expect(shallowItem?.depth).toBe(0); + expect(shallowItem?.etag).toBe(null); + expect(shallowItem?.pageId).toBe(11); + }); + }); }); diff --git 
a/src/pipeline/PipelineManager.ts b/src/pipeline/PipelineManager.ts index 8dd09b3c..be65338c 100644 --- a/src/pipeline/PipelineManager.ts +++ b/src/pipeline/PipelineManager.ts @@ -9,7 +9,7 @@ import { v4 as uuidv4 } from "uuid"; import { ScraperRegistry, ScraperService } from "../scraper"; -import type { ScraperOptions, ScraperProgress } from "../scraper/types"; +import type { ScraperOptions, ScraperProgressEvent } from "../scraper/types"; import type { DocumentManagementService } from "../store"; import { VersionStatus } from "../store/types"; import { DEFAULT_MAX_CONCURRENCY } from "../utils/config"; @@ -243,15 +243,6 @@ export class PipelineManager implements IPipeline { // Normalize version: treat undefined/null as "" (unversioned) const normalizedVersion = version ?? ""; - // Extract URL and convert ScraperOptions to VersionScraperOptions - const { - url, - library: _library, - version: _version, - signal: _signal, - ...versionOptions - } = options; - // Abort any existing QUEUED or RUNNING job for the same library+version const allJobs = await this.getJobs(); const duplicateJobs = allJobs.filter( @@ -299,8 +290,8 @@ export class PipelineManager implements IPipeline { progressMaxPages: 0, errorMessage: null, updatedAt: new Date(), - sourceUrl: url, - scraperOptions: versionOptions, + sourceUrl: options.url, + scraperOptions: options, }; this.jobMap.set(jobId, job); @@ -324,7 +315,7 @@ export class PipelineManager implements IPipeline { /** * Enqueues a refresh job for an existing library version by re-scraping all pages - * and using Etag comparison to skip unchanged content. + * and using ETag comparison to skip unchanged content. */ async enqueueRefreshJob( library: string, @@ -333,82 +324,53 @@ export class PipelineManager implements IPipeline { // Normalize version: treat undefined/null as "" (unversioned) const normalizedVersion = version ?? ""; - // First, check if the library version exists try { + // First, check if the library version exists const versionId = await this.store.ensureVersion({ library, version: normalizedVersion, }); - // Get all pages for this version + // Get all pages for this version with their ETags and depths const pages = await this.store.getPagesByVersionId(versionId); if (pages.length === 0) { throw new Error( - `No pages found for ${library}@${normalizedVersion || "unversioned"}. Cannot refresh an empty version.`, + `No pages found for ${library}@${normalizedVersion || "unversioned"}. Use scrape_docs to index it first.`, ); } logger.info( - `🔄 Starting refresh for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`, + `🔄 Preparing refresh job for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`, ); - const jobId = uuidv4(); - const abortController = new AbortController(); - let resolveCompletion!: () => void; - let rejectCompletion!: (reason?: unknown) => void; + // Build initialQueue from pages with original depth values + const initialQueue = pages.map((page) => ({ + url: page.url, + depth: page.depth ?? 
0, // Use original depth, fallback to 0 for old data + pageId: page.id, + etag: page.etag, + })); - const completionPromise = new Promise((resolve, reject) => { - resolveCompletion = resolve; - rejectCompletion = reject; - }); - // Prevent unhandled rejection warnings if rejection occurs before consumers attach handlers - completionPromise.catch(() => {}); + // Get stored scraper options to retrieve the source URL and other options + const storedOptions = await this.store.getScraperOptions(versionId); - const job: InternalPipelineJob = { - id: jobId, + // Build scraper options with initialQueue and isRefresh flag + const scraperOptions = { + url: storedOptions?.sourceUrl || pages[0].url, // Required but not used when initialQueue is set library, version: normalizedVersion, - status: PipelineJobStatus.QUEUED, - progress: null, - error: null, - createdAt: new Date(), - startedAt: null, - finishedAt: null, - abortController, - completionPromise, - resolveCompletion, - rejectCompletion, - // Database fields (single source of truth) - versionId, - versionStatus: this.mapJobStatusToVersionStatus(PipelineJobStatus.QUEUED), - progressPages: 0, - progressMaxPages: pages.length, - errorMessage: null, - updatedAt: new Date(), - sourceUrl: null, // No single source URL for refresh jobs - scraperOptions: null, - // Add refresh-specific metadata - refreshPages: pages, // Store the pages to refresh + initialQueue, // Pre-populated queue with existing pages + maxPages: pages.length, + isRefresh: true, // Mark this as a refresh operation + ...(storedOptions?.options || {}), // Include stored options if available }; - this.jobMap.set(jobId, job); - this.jobQueue.push(jobId); + // Enqueue as a standard scrape job with the initialQueue logger.info( - `📝 Refresh job enqueued: ${jobId} for ${library}${normalizedVersion ? `@${normalizedVersion}` : " (unversioned)"} with ${pages.length} pages`, + `📝 Enqueueing refresh job for ${library}@${normalizedVersion || "unversioned"}`, ); - - // Update database status to QUEUED - await this.updateJobStatus(job, PipelineJobStatus.QUEUED); - - // Trigger processing if manager is running - if (this.isRunning) { - this._processQueue().catch((error) => { - logger.error(`❌ Error in processQueue during refresh enqueue: ${error}`); - }); - } - - return jobId; + return this.enqueueScrapeJob(library, normalizedVersion, scraperOptions); } catch (error) { logger.error(`❌ Failed to enqueue refresh job: ${error}`); throw error; @@ -742,14 +704,8 @@ export class PipelineManager implements IPipeline { // Store scraper options when job is first queued if (newStatus === PipelineJobStatus.QUEUED && job.scraperOptions) { try { - // Reconstruct ScraperOptions for storage (DocumentStore will filter runtime fields) - const fullOptions = { - url: job.sourceUrl ?? 
"", - library: job.library, - version: job.version, - ...job.scraperOptions, - }; - await this.store.storeScraperOptions(versionId, fullOptions); + // Pass the complete scraper options (DocumentStore will filter runtime fields) + await this.store.storeScraperOptions(versionId, job.scraperOptions); logger.debug( `Stored scraper options for ${job.library}@${job.version}: ${job.sourceUrl}`, ); @@ -774,7 +730,7 @@ export class PipelineManager implements IPipeline { */ async updateJobProgress( job: InternalPipelineJob, - progress: ScraperProgress, + progress: ScraperProgressEvent, ): Promise { // Update in-memory progress job.progress = progress; diff --git a/src/pipeline/PipelineWorker.test.ts b/src/pipeline/PipelineWorker.test.ts index fa94fe6d..c84bab5d 100644 --- a/src/pipeline/PipelineWorker.test.ts +++ b/src/pipeline/PipelineWorker.test.ts @@ -1,8 +1,7 @@ import { beforeEach, describe, expect, it, type Mock, vi } from "vitest"; import type { ScraperService } from "../scraper"; -import type { ScraperProgress } from "../scraper/types"; +import type { ScrapeResult, ScraperProgressEvent } from "../scraper/types"; import type { DocumentManagementService } from "../store/DocumentManagementService"; -import type { Document } from "../types"; import { PipelineWorker } from "./PipelineWorker"; import type { InternalPipelineJob, PipelineManagerCallbacks } from "./types"; import { PipelineJobStatus } from "./types"; @@ -24,8 +23,9 @@ describe("PipelineWorker", () => { vi.resetAllMocks(); mockStore = { - addDocument: vi.fn().mockResolvedValue(undefined), + addScrapeResult: vi.fn().mockResolvedValue(undefined), removeAllDocuments: vi.fn().mockResolvedValue(undefined), + removeDocumentsByPageId: vi.fn().mockResolvedValue(undefined), }; mockScraperService = { @@ -65,53 +65,56 @@ describe("PipelineWorker", () => { rejectCompletion: vi.fn(), sourceUrl: "http://example.com", scraperOptions: { + url: "http://example.com", + library: "test-lib", + version: "1.0.0", maxPages: 10, maxDepth: 1, }, }; }); - it("should execute job successfully, calling scrape, addDocument, and onJobProgress", async () => { - const mockDoc1: Document = { - content: "doc1", - metadata: { - url: "url1", - title: "Doc 1", - library: mockJob.library, // Add required field - version: mockJob.version, // Add required field - }, + it("should execute job successfully, calling scrape, addScrapeResult, and onJobProgress", async () => { + const mockProcessed1: ScrapeResult = { + textContent: "doc1", + url: "url1", + title: "Doc 1", + contentType: "text/html", + chunks: [], + links: [], + errors: [], }; - const mockDoc2: Document = { - content: "doc2", - metadata: { - url: "url2", - title: "Doc 2", - library: mockJob.library, // Add required field - version: mockJob.version, // Add required field - }, + const mockProcessed2: ScrapeResult = { + textContent: "doc2", + url: "url2", + title: "Doc 2", + contentType: "text/html", + chunks: [], + links: [], + errors: [], }; // Configure mock scrape to yield progress (mockScraperService.scrape as Mock).mockImplementation( async (_options, progressCallback, _signal) => { - const progress1: ScraperProgress = { + const progress1: ScraperProgressEvent = { pagesScraped: 1, totalPages: 2, currentUrl: "url1", depth: 1, maxDepth: 1, - document: mockDoc1, + result: mockProcessed1, totalDiscovered: 0, }; await progressCallback(progress1); - const progress2: ScraperProgress = { + const progress2: ScraperProgressEvent = { pagesScraped: 2, totalPages: 2, currentUrl: "url2", depth: 1, maxDepth: 1, - document: 
mockDoc2, + result: mockProcessed2, totalDiscovered: 0, }; await progressCallback(progress2); @@ -127,39 +130,38 @@ describe("PipelineWorker", () => { mockJob.version, ); - // Verify scrape was called + // Verify scrape was called with the complete scraper options expect(mockScraperService.scrape).toHaveBeenCalledOnce(); expect(mockScraperService.scrape).toHaveBeenCalledWith( - { - url: mockJob.sourceUrl, - library: mockJob.library, - version: mockJob.version, - ...mockJob.scraperOptions, - }, + mockJob.scraperOptions, // Now passes the complete options directly expect.any(Function), // The progress callback abortController.signal, ); - // Verify addDocument was called for each document - expect(mockStore.addDocument).toHaveBeenCalledTimes(2); - expect(mockStore.addDocument).toHaveBeenCalledWith(mockJob.library, mockJob.version, { - pageContent: mockDoc1.content, - metadata: mockDoc1.metadata, - }); - expect(mockStore.addDocument).toHaveBeenCalledWith(mockJob.library, mockJob.version, { - pageContent: mockDoc2.content, - metadata: mockDoc2.metadata, - }); + // Verify addScrapeResult was called for each document + expect(mockStore.addScrapeResult).toHaveBeenCalledTimes(2); + expect(mockStore.addScrapeResult).toHaveBeenCalledWith( + mockJob.library, + mockJob.version, + 1, + mockProcessed1, + ); + expect(mockStore.addScrapeResult).toHaveBeenCalledWith( + mockJob.library, + mockJob.version, + 1, + mockProcessed2, + ); // Verify onJobProgress was called expect(mockCallbacks.onJobProgress).toHaveBeenCalledTimes(2); expect(mockCallbacks.onJobProgress).toHaveBeenCalledWith( mockJob, - expect.objectContaining({ document: mockDoc1 }), + expect.objectContaining({ result: mockProcessed1 }), ); expect(mockCallbacks.onJobProgress).toHaveBeenCalledWith( mockJob, - expect.objectContaining({ document: mockDoc2 }), + expect.objectContaining({ result: mockProcessed2 }), ); // Verify job progress object was NOT updated directly by worker @@ -178,67 +180,81 @@ describe("PipelineWorker", () => { // Verify dependencies were called appropriately expect(mockScraperService.scrape).toHaveBeenCalledOnce(); - expect(mockStore.addDocument).not.toHaveBeenCalled(); + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); expect(mockCallbacks.onJobProgress).not.toHaveBeenCalled(); expect(mockCallbacks.onJobError).not.toHaveBeenCalled(); }); - it("should call onJobError and continue if store.addDocument fails", async () => { - const mockDoc: Document = { - content: "doc1", - metadata: { url: "url1", title: "Doc 1", library: "test-lib", version: "1.0.0" }, + it("should call onJobError and continue if store.addScrapeResult fails", async () => { + const mockProcessed: ScrapeResult = { + textContent: "doc1", + url: "url1", + title: "Doc 1", + contentType: "text/html", + chunks: [], + links: [], + errors: [], }; const storeError = new Error("Database error"); // Simulate scrape yielding one document (mockScraperService.scrape as Mock).mockImplementation( async (_options, progressCallback, _signal) => { - const progress: ScraperProgress = { + const progress: ScraperProgressEvent = { pagesScraped: 1, totalPages: 1, currentUrl: "url1", depth: 1, maxDepth: 1, - document: mockDoc, + result: mockProcessed, totalDiscovered: 0, }; await progressCallback(progress); }, ); - // Simulate addDocument failing - (mockStore.addDocument as Mock).mockRejectedValue(storeError); + // Simulate addScrapeResult failing + (mockStore.addScrapeResult as Mock).mockRejectedValue(storeError); // Execute the job - should complete despite the error await 
expect(worker.executeJob(mockJob, mockCallbacks)).resolves.toBeUndefined(); // Verify scrape was called expect(mockScraperService.scrape).toHaveBeenCalledOnce(); - // Verify addDocument was called - expect(mockStore.addDocument).toHaveBeenCalledOnce(); + // Verify addScrapeResult was called + expect(mockStore.addScrapeResult).toHaveBeenCalledOnce(); // Verify onJobProgress was called expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); - // Verify onJobError was called + // Verify onJobError was called with the page that failed expect(mockCallbacks.onJobError).toHaveBeenCalledOnce(); - expect(mockCallbacks.onJobError).toHaveBeenCalledWith(mockJob, storeError, mockDoc); + expect(mockCallbacks.onJobError).toHaveBeenCalledWith( + mockJob, + storeError, + mockProcessed, + ); }); it("should throw CancellationError if cancelled during scrape progress", async () => { - const mockDoc: Document = { - content: "doc1", - metadata: { url: "url1", title: "Doc 1", library: "test-lib", version: "1.0.0" }, + const mockProcessed: ScrapeResult = { + textContent: "doc1", + url: "url1", + title: "Doc 1", + contentType: "text/html", + chunks: [], + links: [], + errors: [], }; // Simulate scrape checking signal and throwing (mockScraperService.scrape as Mock).mockImplementation( async (_options, progressCallback, _signal) => { - const progress: ScraperProgress = { + const progress: ScraperProgressEvent = { pagesScraped: 1, totalPages: 2, currentUrl: "url1", depth: 1, maxDepth: 1, - document: mockDoc, + result: mockProcessed, totalDiscovered: 0, }; // Simulate cancellation happening *before* progress is processed by worker @@ -259,8 +275,8 @@ describe("PipelineWorker", () => { // Verify scrape was called expect(mockScraperService.scrape).toHaveBeenCalledOnce(); - // Verify addDocument was NOT called - expect(mockStore.addDocument).not.toHaveBeenCalled(); + // Verify addScrapeResult was NOT called + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); // Verify onJobProgress was NOT called because cancellation check happens first expect(mockCallbacks.onJobProgress).not.toHaveBeenCalled(); // Verify onJobError was NOT called @@ -289,8 +305,50 @@ describe("PipelineWorker", () => { // Verify scrape was called (now only once) expect(mockScraperService.scrape).toHaveBeenCalledOnce(); // Verify other callbacks not called - expect(mockStore.addDocument).not.toHaveBeenCalled(); + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); expect(mockCallbacks.onJobProgress).not.toHaveBeenCalled(); expect(mockCallbacks.onJobError).not.toHaveBeenCalled(); }); + + it("should fail the job if document deletion fails during refresh", async () => { + const deletionError = new Error("Database deletion failed"); + + // Simulate scrape yielding a deletion event (404 page) + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url1", + depth: 1, + maxDepth: 1, + deleted: true, // This is a deletion event + result: null, + pageId: 123, // Page ID to delete + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + // Simulate removeDocumentsByPageId failing + (mockStore.removeDocumentsByPageId as Mock).mockRejectedValue(deletionError); + + // Execute the job - should fail due to deletion error + await expect(worker.executeJob(mockJob, mockCallbacks)).rejects.toThrow( + "Database deletion failed", + ); + + // Verify scrape was called + 
expect(mockScraperService.scrape).toHaveBeenCalledOnce(); + // Verify deletion was attempted + expect(mockStore.removeDocumentsByPageId).toHaveBeenCalledWith(123); + // Verify onJobProgress was called + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + // Verify onJobError was called with the deletion error + expect(mockCallbacks.onJobError).toHaveBeenCalledOnce(); + expect(mockCallbacks.onJobError).toHaveBeenCalledWith(mockJob, deletionError); + // Verify addScrapeResult was NOT called (deletion failed before that) + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); + }); }); diff --git a/src/pipeline/PipelineWorker.ts b/src/pipeline/PipelineWorker.ts index ae3c3977..dcdbb4ab 100644 --- a/src/pipeline/PipelineWorker.ts +++ b/src/pipeline/PipelineWorker.ts @@ -1,5 +1,5 @@ import type { ScraperService } from "../scraper"; -import type { ScraperProgress } from "../scraper/types"; +import type { ScraperProgressEvent } from "../scraper/types"; import type { DocumentManagementService } from "../store"; import { logger } from "../utils/logger"; import { CancellationError } from "./errors"; @@ -29,37 +29,29 @@ export class PipelineWorker { job: InternalPipelineJob, callbacks: PipelineManagerCallbacks, ): Promise { - const { - id: jobId, - library, - version, - sourceUrl, - scraperOptions, - abortController, - } = job; + const { id: jobId, library, version, scraperOptions, abortController } = job; const signal = abortController.signal; logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`); try { // Clear existing documents for this library/version before scraping - await this.store.removeAllDocuments(library, version); - logger.info( - `💾 Cleared store for ${library}@${version || "[no version]"} before scraping.`, - ); - - // Construct runtime options from job context + stored configuration - const runtimeOptions = { - url: sourceUrl ?? 
"", - library, - version, - ...scraperOptions, - }; + // Skip this step for refresh operations to preserve existing data + if (!scraperOptions.isRefresh) { + await this.store.removeAllDocuments(library, version); + logger.info( + `💾 Cleared store for ${library}@${version || "[no version]"} before scraping.`, + ); + } else { + logger.info( + `🔄 Refresh operation - preserving existing data for ${library}@${version || "[no version]"}.`, + ); + } // --- Core Job Logic --- await this.scraperService.scrape( - runtimeOptions, - async (progress: ScraperProgress) => { + scraperOptions, + async (progress: ScraperProgressEvent) => { // Check for cancellation signal before processing each document if (signal.aborted) { throw new CancellationError("Job cancelled during scraping progress"); @@ -69,27 +61,55 @@ export class PipelineWorker { // Report progress via manager's callback (single source of truth) await callbacks.onJobProgress?.(job, progress); - if (progress.document) { + // Handle deletion events (404 during refresh or broken links) + if (progress.deleted && progress.pageId) { try { - await this.store.addDocument(library, version, { - pageContent: progress.document.content, - metadata: { - ...progress.document.metadata, - mimeType: progress.document.contentType, // Pass contentType as mimeType in metadata - }, - }); + await this.store.removeDocumentsByPageId(progress.pageId); logger.debug( - `[${jobId}] Stored document: ${progress.document.metadata.url}`, + `[${jobId}] Deleted documents for page ${progress.pageId}: ${progress.currentUrl}`, + ); + } catch (docError) { + logger.error( + `❌ [${jobId}] Failed to delete documents for page ${progress.pageId}: ${docError}`, + ); + + // Report the error and fail the job to ensure data integrity + const error = + docError instanceof Error ? docError : new Error(String(docError)); + await callbacks.onJobError?.(job, error); + // Re-throw to fail the job - deletion failures indicate serious database issues + // and leaving orphaned documents would compromise index accuracy + throw error; + } + } + // Handle successful content processing + else if (progress.result) { + try { + // For refresh operations, delete old documents before adding new ones + if (progress.pageId) { + await this.store.removeDocumentsByPageId(progress.pageId); + logger.debug( + `[${jobId}] Refreshing documents for page ${progress.pageId}: ${progress.currentUrl}`, + ); + } + + // Add the processed content to the store + await this.store.addScrapeResult( + library, + version, + progress.depth, + progress.result, ); + logger.debug(`[${jobId}] Stored processed content: ${progress.currentUrl}`); } catch (docError) { logger.error( - `❌ [${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`, + `❌ [${jobId}] Failed to process content ${progress.currentUrl}: ${docError}`, ); // Report document-specific errors via manager's callback await callbacks.onJobError?.( job, docError instanceof Error ? docError : new Error(String(docError)), - progress.document, + progress.result, ); // Decide if a single document error should fail the whole job // For now, we log and continue. To fail, re-throw here. 
diff --git a/src/pipeline/trpc/router.ts b/src/pipeline/trpc/router.ts index d7a41cd6..26d218fd 100644 --- a/src/pipeline/trpc/router.ts +++ b/src/pipeline/trpc/router.ts @@ -31,12 +31,17 @@ const optionalTrimmed = z.preprocess( z.string().min(1).optional().nullable(), ); -const enqueueInput = z.object({ +const enqueueScrapeInput = z.object({ library: nonEmptyTrimmed, version: optionalTrimmed, options: z.custom(), }); +const enqueueRefreshInput = z.object({ + library: nonEmptyTrimmed, + version: optionalTrimmed, +}); + const jobIdInput = z.object({ id: z.string().min(1) }); const getJobsInput = z.object({ @@ -47,17 +52,17 @@ const getJobsInput = z.object({ export function createPipelineRouter(trpc: unknown) { const tt = trpc as typeof t; return tt.router({ - enqueueJob: tt.procedure - .input(enqueueInput) + enqueueScrapeJob: tt.procedure + .input(enqueueScrapeInput) .mutation( async ({ ctx, input, }: { ctx: PipelineTrpcContext; - input: z.infer; + input: z.infer; }) => { - const jobId = await ctx.pipeline.enqueueJob( + const jobId = await ctx.pipeline.enqueueScrapeJob( input.library, input.version ?? null, input.options, @@ -83,6 +88,25 @@ export function createPipelineRouter(trpc: unknown) { }, ), + enqueueRefreshJob: tt.procedure + .input(enqueueRefreshInput) + .mutation( + async ({ + ctx, + input, + }: { + ctx: PipelineTrpcContext; + input: z.infer; + }) => { + const jobId = await ctx.pipeline.enqueueRefreshJob( + input.library, + input.version ?? null, + ); + + return { jobId }; + }, + ), + getJob: tt.procedure .input(jobIdInput) .query( diff --git a/src/pipeline/types.ts b/src/pipeline/types.ts index 4e3e5b43..2ac6f893 100644 --- a/src/pipeline/types.ts +++ b/src/pipeline/types.ts @@ -1,6 +1,9 @@ -import type { ScraperProgress } from "../scraper/types"; -import type { VersionScraperOptions, VersionStatus } from "../store/types"; -import type { Document } from "../types"; // Use local Document type +import type { + ScrapeResult, + ScraperOptions, + ScraperProgressEvent, +} from "../scraper/types"; +import type { VersionStatus } from "../store/types"; /** * Represents the possible states of a pipeline job. @@ -28,7 +31,7 @@ export interface PipelineJob { /** Current pipeline status of the job. */ status: PipelineJobStatus; /** Detailed progress information. */ - progress: ScraperProgress | null; + progress: ScraperProgressEvent | null; /** Error information if the job failed. */ error: { message: string } | null; /** Timestamp when the job was created. */ @@ -52,18 +55,24 @@ export interface PipelineJob { /** Original scraping URL. */ sourceUrl: string | null; /** Stored scraper options for reproducibility. */ - scraperOptions: VersionScraperOptions | null; + scraperOptions: ScraperOptions | null; } /** * Internal pipeline job representation used within PipelineManager. * Contains non-serializable fields for job management and control. + * + * Note: scraperOptions is required (non-nullable) for internal jobs as they + * always have complete runtime configuration available. */ -export interface InternalPipelineJob extends Omit { +export interface InternalPipelineJob + extends Omit { /** The library version associated with the job (internal uses string). */ version: string; /** Error object if the job failed. */ error: Error | null; + /** Complete scraper options with runtime configuration. */ + scraperOptions: ScraperOptions; /** AbortController to signal cancellation. */ abortController: AbortController; /** Promise that resolves/rejects when the job finishes. 
*/ @@ -82,11 +91,14 @@ export interface PipelineManagerCallbacks { /** Callback triggered when a job's status changes. */ onJobStatusChange?: (job: InternalPipelineJob) => Promise; /** Callback triggered when a job makes progress. */ - onJobProgress?: (job: InternalPipelineJob, progress: ScraperProgress) => Promise; + onJobProgress?: ( + job: InternalPipelineJob, + progress: ScraperProgressEvent, + ) => Promise; /** Callback triggered when a job encounters an error during processing (e.g., storing a doc). */ onJobError?: ( job: InternalPipelineJob, error: Error, - document?: Document, + page?: ScrapeResult, ) => Promise; } diff --git a/src/scraper/ScraperService.test.ts b/src/scraper/ScraperService.test.ts index 6c490284..8faa842b 100644 --- a/src/scraper/ScraperService.test.ts +++ b/src/scraper/ScraperService.test.ts @@ -3,7 +3,7 @@ import type { ProgressCallback } from "../types"; import { ScraperError } from "../utils/errors"; import type { ScraperRegistry } from "./ScraperRegistry"; import { ScraperService } from "./ScraperService"; -import type { ScraperOptions, ScraperProgress } from "./types"; +import type { ScraperOptions, ScraperProgressEvent } from "./types"; vi.mock("../utils/logger"); @@ -27,7 +27,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(mockStrategy); // Call scrape without a signal (it's optional) @@ -51,7 +51,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(mockStrategy); // Call scrape without a signal @@ -74,7 +74,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(mockStrategy); // Call scrape without a signal @@ -98,7 +98,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(null); @@ -117,7 +117,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(mockStrategy); mockStrategy.scrape.mockRejectedValue(new Error("Strategy error")); @@ -138,7 +138,7 @@ describe("ScraperService", () => { maxPages: 1, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); // Mock a strategy that would handle JSON files const jsonStrategy = { diff --git a/src/scraper/ScraperService.ts b/src/scraper/ScraperService.ts index 853b3f12..c88e5334 100644 --- a/src/scraper/ScraperService.ts +++ b/src/scraper/ScraperService.ts @@ -1,7 +1,7 @@ import type { ProgressCallback } from "../types"; import { ScraperError } from "../utils/errors"; import type { ScraperRegistry } from "./ScraperRegistry"; -import type { ScraperOptions, ScraperProgress } from "./types"; +import type { ScraperOptions, ScraperProgressEvent } from "./types"; /** * Orchestrates document scraping operations using registered scraping strategies. 
@@ -20,7 +20,7 @@ export class ScraperService { */ async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, // Add optional signal parameter ): Promise { // Find strategy for this URL diff --git a/src/scraper/fetcher/BrowserFetcher.ts b/src/scraper/fetcher/BrowserFetcher.ts index d9db4df7..37e8ffde 100644 --- a/src/scraper/fetcher/BrowserFetcher.ts +++ b/src/scraper/fetcher/BrowserFetcher.ts @@ -3,7 +3,12 @@ import { ScraperError } from "../../utils/errors"; import { logger } from "../../utils/logger"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; import { FingerprintGenerator } from "./FingerprintGenerator"; -import type { ContentFetcher, FetchOptions, RawContent } from "./types"; +import { + type ContentFetcher, + type FetchOptions, + FetchStatus, + type RawContent, +} from "./types"; /** * Fetches content using a headless browser (Playwright). @@ -82,6 +87,7 @@ export class BrowserFetcher implements ContentFetcher { encoding: undefined, // Browser handles encoding automatically source: finalUrl, etag, + status: FetchStatus.SUCCESS, } satisfies RawContent; } catch (error) { if (options?.signal?.aborted) { diff --git a/src/scraper/fetcher/FileFetcher.test.ts b/src/scraper/fetcher/FileFetcher.test.ts index 546618d6..5a7dac3a 100644 --- a/src/scraper/fetcher/FileFetcher.test.ts +++ b/src/scraper/fetcher/FileFetcher.test.ts @@ -112,10 +112,33 @@ describe("FileFetcher", () => { expect(mdResult.mimeType).toBe("text/markdown"); }); - it("should throw error if file does not exist", async () => { + it("should return status NOT_FOUND if file does not exist", async () => { const fetcher = new FileFetcher(); - await expect(fetcher.fetch("file:///path/to/file.txt")).rejects.toThrow(ScraperError); + const result = await fetcher.fetch("file:///path/to/nonexistent-file.txt"); + expect(result.status).toBe("not_found"); + }); + + it("should throw ScraperError for other file system errors", async () => { + const fetcher = new FileFetcher(); + const filePath = "/path/to/permission-denied.txt"; + + // Create the file in the virtual filesystem first + vol.fromJSON({ + [filePath]: "test content", + }); + + // Simulate a permission error by mocking stat to succeed but readFile to fail + const permissionError = new Error("EACCES: permission denied"); + (permissionError as NodeJS.ErrnoException).code = "EACCES"; + const readFileSpy = vi + .spyOn(vol.promises, "readFile") + .mockRejectedValue(permissionError); + + await expect(fetcher.fetch(`file://${filePath}`)).rejects.toThrow(ScraperError); + + // Restore the spy + readFileSpy.mockRestore(); }); it("should only handle file protocol", async () => { diff --git a/src/scraper/fetcher/FileFetcher.ts b/src/scraper/fetcher/FileFetcher.ts index 727c40d3..f7a6129c 100644 --- a/src/scraper/fetcher/FileFetcher.ts +++ b/src/scraper/fetcher/FileFetcher.ts @@ -2,7 +2,12 @@ import crypto from "node:crypto"; import fs from "node:fs/promises"; import { ScraperError } from "../../utils/errors"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; -import type { ContentFetcher, FetchOptions, RawContent } from "./types"; +import { + type ContentFetcher, + type FetchOptions, + FetchStatus, + type RawContent, +} from "./types"; /** * Fetches content from local file system. @@ -15,8 +20,9 @@ export class FileFetcher implements ContentFetcher { /** * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed. 
* Uses enhanced MIME type detection for better source code file recognition. + * Supports conditional fetching via ETag comparison for efficient refresh operations. */ - async fetch(source: string, _options?: FetchOptions): Promise { + async fetch(source: string, options?: FetchOptions): Promise { // Remove the file:// protocol prefix and handle both file:// and file:/// formats let filePath = source.replace(/^file:\/\/\/?/, ""); @@ -29,30 +35,54 @@ export class FileFetcher implements ContentFetcher { } try { - const [content, stats] = await Promise.all([ - fs.readFile(filePath), - fs.stat(filePath), - ]); + const stats = await fs.stat(filePath); - // Use enhanced MIME type detection that properly handles source code files - const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath); - const mimeType = detectedMimeType || "application/octet-stream"; - - // Generate pseudo-ETag from last modified time - const etag = crypto + // Generate current ETag from last modified time + const currentEtag = crypto .createHash("md5") .update(stats.mtime.toISOString()) .digest("hex"); + // Check if file has been modified (ETag comparison) + if (options?.etag && options.etag === currentEtag) { + // File hasn't changed - return NOT_MODIFIED status + return { + content: Buffer.from(""), + mimeType: "text/plain", + source, + etag: currentEtag, + lastModified: stats.mtime.toISOString(), + status: FetchStatus.NOT_MODIFIED, + }; + } + + // File is new or has been modified - read the content + const content = await fs.readFile(filePath); + + // Use enhanced MIME type detection that properly handles source code files + const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath); + const mimeType = detectedMimeType || "application/octet-stream"; + return { content, mimeType, source, - etag, + etag: currentEtag, lastModified: stats.mtime.toISOString(), + status: FetchStatus.SUCCESS, // Don't assume charset for text files - let the pipeline detect it }; } catch (error: unknown) { + // Check for file not found error + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + return { + content: Buffer.from(""), + mimeType: "text/plain", + source, + status: FetchStatus.NOT_FOUND, + }; + } + // For all other errors, throw a ScraperError throw new ScraperError( `Failed to read file ${filePath}: ${ (error as { message?: string }).message ?? 
"Unknown error" diff --git a/src/scraper/fetcher/HttpFetcher.test.ts b/src/scraper/fetcher/HttpFetcher.test.ts index 5fc9e29f..c8d731c8 100644 --- a/src/scraper/fetcher/HttpFetcher.test.ts +++ b/src/scraper/fetcher/HttpFetcher.test.ts @@ -292,8 +292,8 @@ describe("HttpFetcher", () => { it("should not retry on non-retryable HTTP status codes", async () => { const fetcher = new HttpFetcher(); - // Test various non-retryable status codes - const nonRetryableStatuses = [400, 401, 403, 404, 405, 410]; + // Test various non-retryable status codes (excluding 404 which has special handling) + const nonRetryableStatuses = [400, 401, 403, 405, 410]; for (const status of nonRetryableStatuses) { mockedAxios.get.mockReset(); @@ -310,6 +310,20 @@ describe("HttpFetcher", () => { } }); + it("should return not_found status for 404 responses", async () => { + const fetcher = new HttpFetcher(); + mockedAxios.get.mockRejectedValue({ response: { status: 404 } }); + + const result = await fetcher.fetch("https://example.com", { + maxRetries: 2, + retryDelay: 1, + }); + + // 404 should return result with not_found status instead of throwing + expect(result.status).toBe("not_found"); + expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries + }); + it("should retry on undefined status (network errors)", async () => { const fetcher = new HttpFetcher(); // Simulate network error without response object @@ -363,11 +377,12 @@ describe("HttpFetcher", () => { const fetcher = new HttpFetcher(); mockedAxios.get.mockRejectedValue({ response: { status: 404 } }); - await expect( - fetcher.fetch("https://example.com", { - retryDelay: 1, // Use minimal delay - }), - ).rejects.toThrow(ScraperError); + const result = await fetcher.fetch("https://example.com", { + retryDelay: 1, // Use minimal delay + }); + + // Should return result with error status instead of throwing + expect(result.status).toBe("not_found"); expect(mockedAxios.get).toHaveBeenCalledTimes(1); }); diff --git a/src/scraper/fetcher/HttpFetcher.ts b/src/scraper/fetcher/HttpFetcher.ts index 32846b4e..b9bfe849 100644 --- a/src/scraper/fetcher/HttpFetcher.ts +++ b/src/scraper/fetcher/HttpFetcher.ts @@ -6,7 +6,12 @@ import { ChallengeError, RedirectError, ScraperError } from "../../utils/errors" import { logger } from "../../utils/logger"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; import { FingerprintGenerator } from "./FingerprintGenerator"; -import type { ContentFetcher, FetchOptions, RawContent } from "./types"; +import { + type ContentFetcher, + type FetchOptions, + FetchStatus, + type RawContent, +} from "./types"; /** * Fetches content from remote sources using HTTP/HTTPS. 
@@ -116,11 +121,16 @@ export class HttpFetcher implements ContentFetcher { for (let attempt = 0; attempt <= maxRetries; attempt++) { try { const fingerprint = this.fingerprintGenerator.generateHeaders(); - const headers = { + const headers: Record = { ...fingerprint, ...options?.headers, // User-provided headers override generated ones }; + // Add If-None-Match header for conditional requests if ETag is provided + if (options?.etag) { + headers["If-None-Match"] = options.etag; + } + const config: AxiosRequestConfig = { responseType: "arraybuffer", headers: { @@ -138,6 +148,17 @@ export class HttpFetcher implements ContentFetcher { const response = await axios.get(source, config); + // Handle 304 Not Modified responses for conditional requests + if (response.status === 304) { + logger.debug(`🔄 Content not modified (304): ${source}`); + return { + content: Buffer.from(""), + mimeType: "text/plain", + source: source, + status: FetchStatus.NOT_MODIFIED, + } satisfies RawContent; + } + const contentTypeHeader = response.headers["content-type"]; const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader); const contentEncoding = response.headers["content-encoding"]; @@ -182,6 +203,7 @@ export class HttpFetcher implements ContentFetcher { source: finalUrl, etag, lastModified: lastModifiedISO, + status: FetchStatus.SUCCESS, } satisfies RawContent; } catch (error: unknown) { const axiosError = error as AxiosError; @@ -194,6 +216,17 @@ export class HttpFetcher implements ContentFetcher { throw new CancellationError("HTTP fetch cancelled"); } + // Handle 404 Not Found - return special status for refresh operations + if (status === 404) { + logger.debug(`❌ Resource not found (404): ${source}`); + return { + content: Buffer.from(""), + mimeType: "text/plain", + source: source, + status: FetchStatus.NOT_FOUND, + } satisfies RawContent; + } + // Handle redirect errors (status codes 301, 302, 303, 307, 308) if (!followRedirects && status && status >= 300 && status < 400) { const location = axiosError.response?.headers?.location; diff --git a/src/scraper/fetcher/types.ts b/src/scraper/fetcher/types.ts index 7f401af1..3769d752 100644 --- a/src/scraper/fetcher/types.ts +++ b/src/scraper/fetcher/types.ts @@ -1,3 +1,29 @@ +/** + * Semantic status of a fetch operation, abstracting HTTP status codes + * into meaningful states for content processing. + */ +export enum FetchStatus { + /** + * Content was successfully fetched (HTTP 200 or new file). + * The content field will contain the fetched data. + */ + SUCCESS = "success", + + /** + * Content has not been modified since the last fetch (HTTP 304). + * The content field will be empty. Occurs when etag is provided + * in FetchOptions and matches the server's current ETag. + */ + NOT_MODIFIED = "not_modified", + + /** + * The resource was not found (HTTP 404 or file doesn't exist). + * The content field will be empty. In refresh operations, + * this indicates the page should be removed from the index. + */ + NOT_FOUND = "not_found", +} + /** * Raw content fetched from a source before processing. * Includes metadata about the content for proper processing. @@ -32,6 +58,14 @@ export interface RawContent { * For local files, this is the file modification time. */ lastModified?: string; + /** + * Semantic status of the fetch operation. 
+ * Abstracts HTTP status codes into meaningful states: + * - SUCCESS: Content was fetched successfully + * - NOT_MODIFIED: Content unchanged since last fetch (conditional request) + * - NOT_FOUND: Resource doesn't exist (should be removed from index) + */ + status: FetchStatus; } /** @@ -50,6 +84,12 @@ export interface FetchOptions { signal?: AbortSignal; /** Whether to follow HTTP redirects (3xx responses) */ followRedirects?: boolean; + /** + * ETag value for conditional requests. + * When provided, the fetcher will include an If-None-Match header + * and may return a 304 Not Modified response if content hasn't changed. + */ + etag?: string | null; } /** diff --git a/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts b/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts index 1a0d01ff..67fcae0c 100644 --- a/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts +++ b/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts @@ -64,8 +64,8 @@ const createMockContext = ( ): MiddlewareContext => { return { content: htmlContent, + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, diff --git a/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts b/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts index deee15d2..f8f7a583 100644 --- a/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts +++ b/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts @@ -7,7 +7,7 @@ import { type MockedObject, vi, } from "vitest"; -import type { ContentFetcher, RawContent } from "../fetcher/types"; +import { type ContentFetcher, FetchStatus, type RawContent } from "../fetcher/types"; import type { SandboxExecutionOptions, SandboxExecutionResult } from "../utils/sandbox"; import { executeJsInSandbox } from "../utils/sandbox"; import { HtmlJsExecutorMiddleware } from "./HtmlJsExecutorMiddleware"; @@ -37,7 +37,7 @@ describe("HtmlJsExecutorMiddleware", () => { mockContext = { source: "http://example.com", content: "", // Will be set in tests - metadata: {}, + contentType: "text/html", links: [], errors: [], options: { @@ -136,6 +136,7 @@ describe("HtmlJsExecutorMiddleware", () => { content: Buffer.from(mockScriptContent), mimeType: "application/javascript", source: "http://example.com/ext.js", + status: FetchStatus.SUCCESS, }; mockFetcher.fetch.mockResolvedValue(mockRawContent); @@ -192,6 +193,7 @@ describe("HtmlJsExecutorMiddleware", () => { content: "body { color: red; }", mimeType: "text/css", // Incorrect MIME type source: "http://example.com/style.css", + status: FetchStatus.SUCCESS, }; mockFetcher.fetch.mockResolvedValue(mockRawContent); diff --git a/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts b/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts index c7f604de..6ad2609c 100644 --- a/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts +++ b/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts @@ -29,8 +29,8 @@ const createMockContext = ( ): MiddlewareContext => { const context: MiddlewareContext = { content: htmlContent || "", + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, diff --git a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts index d8f3fb3b..5b567f22 100644 --- a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts +++ 
b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts @@ -29,8 +29,8 @@ const createMockContext = ( ): MiddlewareContext => { const context: MiddlewareContext = { content: htmlContent || "", + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, @@ -52,7 +52,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Head Title"); + expect(context.title).toBe("Head Title"); expect(context.errors).toHaveLength(0); // No need to close Cheerio object @@ -67,7 +67,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Untitled"); + expect(context.title).toBe("Untitled"); expect(context.errors).toHaveLength(0); // No need to close Cheerio object @@ -82,7 +82,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Untitled"); + expect(context.title).toBe("Untitled"); expect(context.errors).toHaveLength(0); // No need to close Cheerio object @@ -98,7 +98,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Extra Whitespace Title"); + expect(context.title).toBe("Extra Whitespace Title"); expect(context.errors).toHaveLength(0); // No need to close Cheerio object @@ -113,7 +113,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBeUndefined(); // Title should not be set + expect(context.title).toBeUndefined(); // Title should not be set expect(warnSpy).toHaveBeenCalledWith( expect.stringContaining("context.dom is missing"), ); @@ -139,7 +139,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); // Should still call next - expect(context.metadata.title).toBeUndefined(); + expect(context.title).toBeUndefined(); expect(context.errors).toHaveLength(1); // Check if the error message includes the original error's message expect(context.errors[0].message).toContain("Failed to extract metadata from HTML"); diff --git a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts index ac40062f..75c725f3 100644 --- a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts +++ b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts @@ -39,7 +39,7 @@ export class HtmlMetadataExtractorMiddleware implements ContentProcessorMiddlewa // Basic cleanup (replace multiple spaces with single space) title = title.replace(/\s+/g, " ").trim(); - context.metadata.title = title; + context.title = title; logger.debug(`Extracted title: "${title}" from ${context.source}`); } catch (error) { logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`); diff --git a/src/scraper/middleware/HtmlNormalizationMiddleware.test.ts b/src/scraper/middleware/HtmlNormalizationMiddleware.test.ts index 6f6092ba..888e9181 100644 --- a/src/scraper/middleware/HtmlNormalizationMiddleware.test.ts +++ b/src/scraper/middleware/HtmlNormalizationMiddleware.test.ts @@ -19,8 +19,8 @@ describe("HtmlNormalizationMiddleware", () 
=> { }; return { content: htmlContent, + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options, @@ -37,8 +37,8 @@ describe("HtmlNormalizationMiddleware", () => { }; const context: MiddlewareContext = { content: "
test
", + contentType: "text/html", source: "https://example.com", - metadata: {}, links: [], errors: [], options, diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts index cca0caca..fd126b20 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts @@ -52,8 +52,8 @@ const createPipelineTestContext = ( const fullOptions = { ...createMockScraperOptions(source), ...options }; return { content, + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: fullOptions, diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts index 5088bad8..2ec1ce76 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts @@ -587,10 +587,7 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { */ async process(context: MiddlewareContext, next: () => Promise): Promise { // Check if we have a MIME type from the raw content and if it's suitable for HTML processing - const contentType = - context.options?.headers?.["content-type"] || - context.metadata?.contentType || - context.metadata?.mimeType; + const contentType = context.options?.headers?.["content-type"] || context.contentType; // Safety check: If we detect this is definitely not HTML content, skip Playwright if ( diff --git a/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts b/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts index 80e00363..e38116e5 100644 --- a/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts +++ b/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts @@ -33,8 +33,8 @@ const createMockContext = ( const fullOptions = { ...createMockScraperOptions(source), ...options }; const context: MiddlewareContext = { content: htmlContent || "", + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: fullOptions, diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts index 755f106a..391f0022 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts @@ -30,8 +30,8 @@ const createMockContext = ( ): MiddlewareContext => { const context: MiddlewareContext = { content: htmlContent || "", + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, diff --git a/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts b/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts index c8143657..33b485d9 100644 --- a/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts +++ b/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts @@ -28,8 +28,8 @@ const createMockContext = ( ): MiddlewareContext => { return { content: markdownContent, + contentType: "text/markdown", source, - metadata: {}, links: initialLinks, errors: [], options: { ...createMockScraperOptions(source), ...options }, diff --git a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts index 573e0895..1e4fc8e7 100644 --- a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts +++ b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts @@ -27,8 +27,8 @@ const createMockContext = ( ): 
MiddlewareContext => { return { content: markdownContent, + contentType: "text/markdown", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, @@ -45,7 +45,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("My Title"); + expect(context.title).toBe("My Title"); expect(context.errors).toHaveLength(0); }); @@ -58,7 +58,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Untitled"); + expect(context.title).toBe("Untitled"); expect(context.errors).toHaveLength(0); }); @@ -71,7 +71,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("My Spaced Title"); + expect(context.title).toBe("My Spaced Title"); expect(context.errors).toHaveLength(0); }); @@ -84,7 +84,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("First Title"); + expect(context.title).toBe("First Title"); expect(context.errors).toHaveLength(0); }); @@ -97,7 +97,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Untitled"); + expect(context.title).toBe("Untitled"); expect(context.errors).toHaveLength(0); }); @@ -110,7 +110,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("The Actual Title"); + expect(context.title).toBe("The Actual Title"); expect(context.errors).toHaveLength(0); }); diff --git a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts index 55eda6e3..0a0c1f7a 100644 --- a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts +++ b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts @@ -16,7 +16,7 @@ export class MarkdownMetadataExtractorMiddleware implements ContentProcessorMidd if (match?.[1]) { title = match[1].trim(); } - context.metadata.title = title; + context.title = title; } catch (error) { context.errors.push( new Error( diff --git a/src/scraper/middleware/types.ts b/src/scraper/middleware/types.ts index 2eadc375..13571e3d 100644 --- a/src/scraper/middleware/types.ts +++ b/src/scraper/middleware/types.ts @@ -6,12 +6,14 @@ import type { ScraperOptions } from "../types"; * Represents the context passed through the middleware pipeline. */ export interface MiddlewareContext { + /** The title of the page or document, extracted during processing */ + title?: string; + /** The MIME type of the content being processed. */ + contentType: string; /** The content being processed (always a string in middleware). */ content: string; /** The original source URL of the content. */ readonly source: string; - /** Extracted metadata (e.g., title). */ - metadata: Record; /** Extracted links from the content. */ links: string[]; /** Errors encountered during processing. 
*/ diff --git a/src/scraper/pipelines/BasePipeline.test.ts b/src/scraper/pipelines/BasePipeline.test.ts index 656b354a..ce4f7b42 100644 --- a/src/scraper/pipelines/BasePipeline.test.ts +++ b/src/scraper/pipelines/BasePipeline.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it, vi } from "vitest"; import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; // Create a concrete subclass of BasePipeline for testing class TestPipeline extends BasePipeline { @@ -10,8 +10,13 @@ class TestPipeline extends BasePipeline { return true; } - async process(): Promise { - return { textContent: "", metadata: {}, links: [], errors: [], chunks: [] }; + async process(): Promise { + return { + textContent: "", + links: [], + errors: [], + chunks: [], + }; } // Expose the protected method for testing @@ -39,21 +44,21 @@ describe("BasePipeline", () => { // Create mock middleware const middleware1 = { process: vi.fn(async (ctx, next) => { - ctx.metadata.step1 = true; + ctx.title = "Step 1"; await next(); }), }; const middleware2 = { process: vi.fn(async (ctx, next) => { - ctx.metadata.step2 = true; + ctx.title = "Step 2"; await next(); }), }; const middleware3 = { process: vi.fn(async (ctx, next) => { - ctx.metadata.step3 = true; + ctx.title = "Step 3"; await next(); }), }; @@ -67,10 +72,8 @@ describe("BasePipeline", () => { expect(middleware2.process).toHaveBeenCalledTimes(1); expect(middleware3.process).toHaveBeenCalledTimes(1); - // Verify the context was updated by each middleware - expect(context.metadata.step1).toBe(true); - expect(context.metadata.step2).toBe(true); - expect(context.metadata.step3).toBe(true); + // Verify the context was updated by the middleware + expect(context.title).toBe("Step 3"); }); it("executeMiddlewareStack catches errors and adds them to context", async () => { diff --git a/src/scraper/pipelines/BasePipeline.ts b/src/scraper/pipelines/BasePipeline.ts index 420a023b..672c10e9 100644 --- a/src/scraper/pipelines/BasePipeline.ts +++ b/src/scraper/pipelines/BasePipeline.ts @@ -1,7 +1,7 @@ import type { ContentFetcher, RawContent } from "../fetcher/types"; import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; import type { ScraperOptions } from "../types"; -import type { ContentPipeline, ProcessedContent } from "./types"; +import type { ContentPipeline, PipelineResult } from "./types"; /** * Base class for content processing pipelines. @@ -9,10 +9,10 @@ import type { ContentPipeline, ProcessedContent } from "./types"; */ export class BasePipeline implements ContentPipeline { /** - * Determines if this pipeline can process the given content. + * Determines if this pipeline can process content with the given MIME type. * Must be implemented by derived classes. 
*/ - public canProcess(_rawContent: RawContent): boolean { + public canProcess(_mimeType: string, _content?: Buffer): boolean { throw new Error("Method not implemented."); } @@ -24,7 +24,7 @@ export class BasePipeline implements ContentPipeline { _rawContent: RawContent, _options: ScraperOptions, _fetcher?: ContentFetcher, - ): Promise { + ): Promise { throw new Error("Method not implemented."); } diff --git a/src/scraper/pipelines/HtmlPipeline.charset.test.ts b/src/scraper/pipelines/HtmlPipeline.charset.test.ts index 2881c6ff..701881ec 100644 --- a/src/scraper/pipelines/HtmlPipeline.charset.test.ts +++ b/src/scraper/pipelines/HtmlPipeline.charset.test.ts @@ -1,5 +1,5 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { ScrapeMode } from "../types"; import { HtmlPipeline } from "./HtmlPipeline"; @@ -41,6 +41,7 @@ describe("HtmlPipeline charset integration", () => { mimeType: "text/html", charset: "utf-8", // Wrong charset from HTTP header source: "https://example.com/test.html", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(rawContent, { @@ -82,6 +83,7 @@ describe("HtmlPipeline charset integration", () => { mimeType: "text/html", charset: "iso-8859-1", // Correct charset from HTTP header source: "https://example.com/test.html", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(rawContent, { @@ -121,6 +123,7 @@ describe("HtmlPipeline charset integration", () => { mimeType: "text/html", // No charset information source: "https://example.com/test.html", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(rawContent, { diff --git a/src/scraper/pipelines/HtmlPipeline.test.ts b/src/scraper/pipelines/HtmlPipeline.test.ts index cc64b788..e8ee3adf 100644 --- a/src/scraper/pipelines/HtmlPipeline.test.ts +++ b/src/scraper/pipelines/HtmlPipeline.test.ts @@ -1,6 +1,6 @@ // Copyright (c) 2025 import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { HtmlCheerioParserMiddleware } from "../middleware/HtmlCheerioParserMiddleware"; import { HtmlLinkExtractorMiddleware } from "../middleware/HtmlLinkExtractorMiddleware"; import { HtmlMetadataExtractorMiddleware } from "../middleware/HtmlMetadataExtractorMiddleware"; @@ -21,17 +21,14 @@ describe("HtmlPipeline", () => { it("canProcess returns true for text/html", () => { const pipeline = new HtmlPipeline(); - expect(pipeline.canProcess({ mimeType: "text/html" } as RawContent)).toBe(true); - expect(pipeline.canProcess({ mimeType: "application/xhtml+xml" } as RawContent)).toBe( - true, - ); + expect(pipeline.canProcess("text/html")).toBe(true); + expect(pipeline.canProcess("application/xhtml+xml")).toBe(true); }); it("canProcess returns false for non-html", () => { const pipeline = new HtmlPipeline(); - expect(pipeline.canProcess({ mimeType: "text/markdown" } as RawContent)).toBe(false); - // @ts-expect-error - expect(pipeline.canProcess({ mimeType: undefined } as RawContent)).toBe(false); + expect(pipeline.canProcess("text/markdown")).toBe(false); + expect(pipeline.canProcess("")).toBe(false); }); it("process decodes Buffer content with UTF-8 charset", async () => { @@ -41,6 +38,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = 
await pipeline.process(raw, {} as ScraperOptions); // Check that we got some markdown content (exact format depends on the actual middleware) @@ -68,6 +66,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "iso-8859-1", // Explicitly set charset to ISO-8859-1 source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); @@ -86,6 +85,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", // No charset specified source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); // Check that we got some markdown content (exact format depends on the actual middleware) @@ -100,6 +100,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); // Check that we got some markdown content (exact format depends on the actual middleware) @@ -116,6 +117,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-16le", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("abc"); @@ -130,6 +132,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("abc"); @@ -143,6 +146,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("こんにちは世界"); @@ -156,6 +160,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("Привет, мир"); @@ -178,6 +183,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); @@ -189,7 +195,7 @@ describe("HtmlPipeline", () => { expect(HtmlToMarkdownMiddleware.prototype.process).toHaveBeenCalledTimes(1); // Verify the result contains expected data from the actual middleware - expect(result.metadata.title).toBe("Test Title"); + expect(result.title).toBe("Test Title"); expect(result.links).toContain("https://test.link/"); expect(result.textContent).toBeTruthy(); expect(result.textContent).toEqual("This is a [test link](https://test.link/)."); @@ -210,9 +216,10 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); - expect(result.errors.some((e) => e.message === "fail")).toBe(true); + expect(result.errors?.some((e) => e.message === "fail")).toBe(true); }); it("should correctly process HTML through the full standard middleware stack (E2E with spies)", async () => { @@ -242,6 +249,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -260,7 +268,7 @@ describe("HtmlPipeline", () => { // Verify the result contains expected data 
// The exact values will depend on the actual middleware implementations - expect(result.metadata.title).toBe("Test Page"); + expect(result.title).toBe("Test Page"); expect(result.links).toContain("https://example.com/test/link"); // Verify the content was sanitized (no script tags) and converted to markdown diff --git a/src/scraper/pipelines/HtmlPipeline.ts b/src/scraper/pipelines/HtmlPipeline.ts index b691c068..9deb9ee9 100644 --- a/src/scraper/pipelines/HtmlPipeline.ts +++ b/src/scraper/pipelines/HtmlPipeline.ts @@ -18,7 +18,7 @@ import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { resolveCharset } from "../utils/charset"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Pipeline for processing HTML content using middleware and semantic splitting with size optimization. @@ -57,15 +57,15 @@ export class HtmlPipeline extends BasePipeline { ); } - canProcess(rawContent: RawContent): boolean { - return MimeTypeUtils.isHtml(rawContent.mimeType); + canProcess(mimeType: string): boolean { + return MimeTypeUtils.isHtml(mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { // Use enhanced charset detection that considers HTML meta tags const resolvedCharset = resolveCharset( rawContent.charset, @@ -76,8 +76,9 @@ export class HtmlPipeline extends BasePipeline { const context: MiddlewareContext = { content: contentString, + contentType: rawContent.mimeType || "text/html", source: rawContent.source, - metadata: {}, + // metadata: {}, links: [], errors: [], options, @@ -99,8 +100,8 @@ export class HtmlPipeline extends BasePipeline { ); return { - textContent: typeof context.content === "string" ? 
context.content : "", - metadata: context.metadata, + title: context.title, + textContent: context.content, links: context.links, errors: context.errors, chunks, diff --git a/src/scraper/pipelines/JsonPipeline.test.ts b/src/scraper/pipelines/JsonPipeline.test.ts index f9bf9a96..4deb0993 100644 --- a/src/scraper/pipelines/JsonPipeline.test.ts +++ b/src/scraper/pipelines/JsonPipeline.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { JsonPipeline } from "./JsonPipeline"; describe("JsonPipeline", () => { @@ -16,47 +16,23 @@ describe("JsonPipeline", () => { describe("canProcess", () => { it("should accept JSON MIME types", () => { - const jsonContent: RawContent = { - content: "{}", - mimeType: "application/json", - charset: "utf-8", - source: "test.json", - }; - - expect(pipeline.canProcess(jsonContent)).toBe(true); + const pipeline = new JsonPipeline(); + expect(pipeline.canProcess("application/json")).toBe(true); }); it("should accept text/json MIME type", () => { - const jsonContent: RawContent = { - content: "{}", - mimeType: "text/json", - charset: "utf-8", - source: "test.json", - }; - - expect(pipeline.canProcess(jsonContent)).toBe(true); + const pipeline = new JsonPipeline(); + expect(pipeline.canProcess("text/json")).toBe(true); }); it("should reject non-JSON MIME types", () => { - const htmlContent: RawContent = { - content: "", - mimeType: "text/html", - charset: "utf-8", - source: "test.html", - }; - - expect(pipeline.canProcess(htmlContent)).toBe(false); + const pipeline = new JsonPipeline(); + expect(pipeline.canProcess("text/html")).toBe(false); }); it("should reject content without MIME type", () => { - const unknownContent: RawContent = { - content: "{}", - mimeType: "", - charset: "utf-8", - source: "test", - }; - - expect(pipeline.canProcess(unknownContent)).toBe(false); + const pipeline = new JsonPipeline(); + expect(pipeline.canProcess("")).toBe(false); }); }); @@ -67,19 +43,20 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "user.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); expect(result.textContent).toBe(jsonContent.content); - expect(result.metadata.title).toBe("John"); // extracted from name field - expect(result.metadata.description).toBeUndefined(); // no description field found - expect(result.metadata.isValidJson).toBe(true); - expect(result.metadata.jsonStructure).toEqual({ - type: "object", - depth: 1, - propertyCount: 2, - }); + expect(result.title).toBe("John"); // extracted from name field + // expect(result.metadata.description).toBeUndefined(); // no description field found + // expect(result.metadata.isValidJson).toBe(true); + // expect(result.metadata.jsonStructure).toEqual({ + // type: "object", + // depth: 1, + // propertyCount: 2, + // }); expect(result.links).toHaveLength(0); expect(result.errors).toHaveLength(0); }); @@ -90,19 +67,20 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "numbers.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); expect(result.textContent).toBe(jsonContent.content); - expect(result.metadata.title).toBeUndefined(); // no title field in array - expect(result.metadata.description).toBeUndefined(); // no description field in array - expect(result.metadata.isValidJson).toBe(true); - 
expect(result.metadata.jsonStructure).toEqual({ - type: "array", - depth: 1, - itemCount: 3, - }); + expect(result.title).toBeUndefined(); // no title field in array + // expect(result.metadata.description).toBeUndefined(); // no description field in array + // expect(result.metadata.isValidJson).toBe(true); + // expect(result.metadata.jsonStructure).toEqual({ + // type: "array", + // depth: 1, + // itemCount: 3, + // }); }); it("should extract title from JSON properties", async () => { @@ -119,12 +97,13 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "api.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); - expect(result.metadata.title).toBe("My API Documentation"); - expect(result.metadata.description).toBe("REST API for user management"); + expect(result.title).toBe("My API Documentation"); + // expect(result.metadata.description).toBe("REST API for user management"); }); it("should handle nested JSON structures", async () => { @@ -147,15 +126,16 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "nested.json", + status: FetchStatus.SUCCESS, }; - const result = await pipeline.process(jsonContent, baseOptions); + const _result = await pipeline.process(jsonContent, baseOptions); - expect(result.metadata.jsonStructure).toEqual({ - type: "object", - depth: 4, // user -> profile -> personal -> name/age - propertyCount: 2, // user, settings - }); + // expect(result.metadata.jsonStructure).toEqual({ + // type: "object", + // depth: 4, // user -> profile -> personal -> name/age + // propertyCount: 2, // user, settings + // }); }); it("should handle invalid JSON gracefully", async () => { @@ -164,15 +144,16 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "invalid.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); expect(result.textContent).toBe(jsonContent.content); - expect(result.metadata.title).toBeUndefined(); // no title/description fields for invalid JSON - expect(result.metadata.description).toBeUndefined(); - expect(result.metadata.isValidJson).toBe(false); - expect(result.metadata.jsonStructure).toBeUndefined(); + expect(result.title).toBeUndefined(); // no title/description fields for invalid JSON + // expect(result.metadata.description).toBeUndefined(); + // expect(result.metadata.isValidJson).toBe(false); + // expect(result.metadata.jsonStructure).toBeUndefined(); }); it("should handle JSON primitives", async () => { @@ -181,16 +162,17 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "string.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(stringContent, baseOptions); - expect(result.metadata.title).toBeUndefined(); // no title field in primitive - expect(result.metadata.description).toBeUndefined(); // no description field in primitive - expect(result.metadata.jsonStructure).toEqual({ - type: "string", - depth: 1, - }); + expect(result.title).toBeUndefined(); // no title field in primitive + // expect(result.metadata.description).toBeUndefined(); // no description field in primitive + // expect(result.metadata.jsonStructure).toEqual({ + // type: "string", + // depth: 1, + // }); }); it("should handle empty JSON structures", async () => { @@ -199,17 +181,18 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "empty.json", + status: 
FetchStatus.SUCCESS, }; const result = await pipeline.process(emptyObjectContent, baseOptions); - expect(result.metadata.title).toBeUndefined(); // no title field in empty object - expect(result.metadata.description).toBeUndefined(); // no description field in empty object - expect(result.metadata.jsonStructure).toEqual({ - type: "object", - depth: 1, - propertyCount: 0, - }); + expect(result.title).toBeUndefined(); // no title field in empty object + // expect(result.metadata.description).toBeUndefined(); // no description field in empty object + // expect(result.metadata.jsonStructure).toEqual({ + // type: "object", + // depth: 1, + // propertyCount: 0, + // }); }); it("should handle Buffer content", async () => { @@ -219,12 +202,13 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "buffer.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); expect(result.textContent).toBe(jsonString); - expect(result.metadata.isValidJson).toBe(true); + // expect(result.metadata.isValidJson).toBe(true); }); }); }); diff --git a/src/scraper/pipelines/JsonPipeline.ts b/src/scraper/pipelines/JsonPipeline.ts index e8c984bd..c3abe6a0 100644 --- a/src/scraper/pipelines/JsonPipeline.ts +++ b/src/scraper/pipelines/JsonPipeline.ts @@ -7,7 +7,7 @@ import type { ContentProcessorMiddleware, MiddlewareContext } from "../middlewar import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Pipeline for processing JSON content with semantic, hierarchical splitting. @@ -28,16 +28,16 @@ export class JsonPipeline extends BasePipeline { }); } - canProcess(rawContent: RawContent): boolean { - if (!rawContent.mimeType) return false; - return MimeTypeUtils.isJson(rawContent.mimeType); + canProcess(mimeType: string): boolean { + if (!mimeType) return false; + return MimeTypeUtils.isJson(mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { const contentString = convertToString(rawContent.content, rawContent.charset); // Validate JSON structure @@ -55,23 +55,26 @@ export class JsonPipeline extends BasePipeline { const fallbackChunks = await this.splitter.splitText(contentString); return { textContent: contentString, - metadata: { - isValidJson: false, - }, + // metadata: { + // isValidJson: false, + // }, links: [], errors: [], chunks: fallbackChunks, }; } + const metadata = this.extractMetadata(parsedJson); const context: MiddlewareContext = { content: contentString, source: rawContent.source, - metadata: { - ...this.extractMetadata(parsedJson), - isValidJson, - jsonStructure: this.analyzeJsonStructure(parsedJson), - }, + title: metadata.title, + contentType: rawContent.mimeType || "application/json", + // metadata: { + // ...this.extractMetadata(parsedJson), + // isValidJson, + // jsonStructure: this.analyzeJsonStructure(parsedJson), + // }, links: [], // JSON files typically don't contain links errors: [], options, @@ -85,8 +88,8 @@ export class JsonPipeline extends BasePipeline { const chunks = await this.splitter.splitText(context.content); return { + title: context.title, textContent: context.content, - metadata: context.metadata, links: context.links, errors: context.errors, chunks, @@ -124,36 +127,6 @@ export class JsonPipeline extends BasePipeline { return 
metadata; } - /** - * Analyzes the structure of valid JSON for metadata - */ - private analyzeJsonStructure(parsedJson: unknown): { - type: string; - depth: number; - itemCount?: number; - propertyCount?: number; - } { - if (Array.isArray(parsedJson)) { - return { - type: "array", - depth: this.calculateDepth(parsedJson), - itemCount: parsedJson.length, - }; - } else if (typeof parsedJson === "object" && parsedJson !== null) { - const obj = parsedJson as Record; - return { - type: "object", - depth: this.calculateDepth(parsedJson), - propertyCount: Object.keys(obj).length, - }; - } else { - return { - type: typeof parsedJson, - depth: 1, - }; - } - } - /** * Calculates the maximum nesting depth of a JSON structure */ diff --git a/src/scraper/pipelines/MarkdownPipeline.test.ts b/src/scraper/pipelines/MarkdownPipeline.test.ts index 8f9cb3b2..9192a5ae 100644 --- a/src/scraper/pipelines/MarkdownPipeline.test.ts +++ b/src/scraper/pipelines/MarkdownPipeline.test.ts @@ -1,6 +1,6 @@ // Copyright (c) 2025 import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { MarkdownLinkExtractorMiddleware } from "../middleware/MarkdownLinkExtractorMiddleware"; import { MarkdownMetadataExtractorMiddleware } from "../middleware/MarkdownMetadataExtractorMiddleware"; import { ScrapeMode, type ScraperOptions } from "../types"; @@ -15,18 +15,15 @@ describe("MarkdownPipeline", () => { it("canProcess returns true for text/markdown", () => { const pipeline = new MarkdownPipeline(); - expect(pipeline.canProcess({ mimeType: "text/markdown" } as RawContent)).toBe(true); - expect(pipeline.canProcess({ mimeType: "text/x-markdown" } as RawContent)).toBe(true); + expect(pipeline.canProcess("text/markdown")).toBe(true); + expect(pipeline.canProcess("text/x-markdown")).toBe(true); }); // MarkdownPipeline now processes all text/* types as markdown, including text/html. 
it("canProcess returns false for non-text types", () => { const pipeline = new MarkdownPipeline(); - expect(pipeline.canProcess({ mimeType: "application/json" } as RawContent)).toBe( - false, - ); - // @ts-expect-error - expect(pipeline.canProcess({ mimeType: undefined } as RawContent)).toBe(false); + expect(pipeline.canProcess("application/json")).toBe(false); + expect(pipeline.canProcess("")).toBe(false); }); it("process decodes Buffer content with UTF-8 charset", async () => { @@ -36,6 +33,7 @@ describe("MarkdownPipeline", () => { mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toBe("# Header\n\nThis is a test."); @@ -67,6 +65,7 @@ describe("MarkdownPipeline", () => { mimeType: "text/markdown", charset: "iso-8859-1", // Explicitly set charset to ISO-8859-1 source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toBe("# Café"); @@ -82,6 +81,7 @@ describe("MarkdownPipeline", () => { mimeType: "text/markdown", // No charset specified source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toBe("# Default UTF-8\n\nContent"); @@ -94,6 +94,7 @@ describe("MarkdownPipeline", () => { mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toBe( @@ -118,6 +119,7 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); @@ -148,9 +150,10 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); - expect(result.errors.some((e) => e.message === "fail")).toBe(true); + expect(result.errors?.some((e) => e.message === "fail")).toBe(true); }); it("process decodes Buffer content with UTF-16LE BOM", async () => { @@ -169,6 +172,7 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-16le", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("# Café"); @@ -184,6 +188,7 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("# Café"); @@ -197,6 +202,7 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("こんにちは世界"); @@ -210,6 +216,7 @@ This is a paragraph with a [link](https://test.example.com). 
mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("Привет, мир"); @@ -241,6 +248,7 @@ More content here. mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -288,6 +296,7 @@ Final content in section B.`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -298,14 +307,14 @@ Final content in section B.`; }); // Verify we got chunks with proper hierarchy - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.chunks?.length).toBeGreaterThan(0); // GreedySplitter may merge small content into fewer chunks // But the hierarchy structure should still be semantically meaningful - expect(result.chunks.length).toBeGreaterThanOrEqual(1); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); // Check that all chunks have valid hierarchy metadata - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.section).toBeDefined(); expect(typeof chunk.section.level).toBe("number"); expect(Array.isArray(chunk.section.path)).toBe(true); @@ -313,8 +322,8 @@ Final content in section B.`; }); // Verify that headings and text are properly identified - const hasHeadings = result.chunks.some((chunk) => chunk.types.includes("heading")); - const hasText = result.chunks.some((chunk) => chunk.types.includes("text")); + const hasHeadings = result.chunks?.some((chunk) => chunk.types.includes("heading")); + const hasText = result.chunks?.some((chunk) => chunk.types.includes("text")); expect(hasHeadings || hasText).toBe(true); // Should have semantic content }); @@ -336,6 +345,7 @@ Content under second level.`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -346,7 +356,7 @@ Content under second level.`; }); // Should not create separate whitespace-only chunks at level 0 - const whitespaceOnlyChunks = result.chunks.filter( + const whitespaceOnlyChunks = result.chunks?.filter( (chunk) => chunk.section.level === 0 && chunk.section.path.length === 0 && @@ -355,7 +365,7 @@ Content under second level.`; expect(whitespaceOnlyChunks).toHaveLength(0); // First heading should be at level 1, not degraded by whitespace - const firstHeading = result.chunks.find( + const firstHeading = result.chunks?.find( (chunk) => chunk.types.includes("heading") && chunk.content.includes("First Heading"), ); @@ -363,7 +373,7 @@ Content under second level.`; expect(firstHeading!.section.level).toBe(1); // Minimum level should be 1 (not degraded to 0 by GreedySplitter) - const minLevel = Math.min(...result.chunks.map((c) => c.section.level)); + const minLevel = Math.min(...result.chunks!.map((c) => c.section.level)); expect(minLevel).toBe(1); }); @@ -395,6 +405,7 @@ ${longContent}`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -405,15 +416,15 @@ ${longContent}`; }); // Should have multiple chunks due to size constraints - expect(result.chunks.length).toBeGreaterThan(1); + expect(result.chunks?.length).toBeGreaterThan(1); // All chunks should be within size limits - result.chunks.forEach((chunk) => { + 
result.chunks?.forEach((chunk) => { expect(chunk.content.length).toBeLessThanOrEqual(100); }); // Should maintain hierarchy levels (not degrade to 0) - const minLevel = Math.min(...result.chunks.map((c) => c.section.level)); + const minLevel = Math.min(...result.chunks!.map((c) => c.section.level)); expect(minLevel).toBeGreaterThanOrEqual(1); }); @@ -440,6 +451,7 @@ More details here.`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -450,14 +462,14 @@ More details here.`; }); // Verify we have content with semantic types (GreedySplitter may merge them) - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.chunks?.length).toBeGreaterThan(0); // Check that we have the expected content types somewhere in the chunks - const allTypes = new Set(result.chunks.flatMap((chunk) => chunk.types)); + const allTypes = new Set(result.chunks?.flatMap((chunk) => chunk.types)); expect(allTypes.has("heading") || allTypes.has("text")).toBe(true); // Verify all chunks have proper section metadata - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.section).toBeDefined(); expect(typeof chunk.section.level).toBe("number"); expect(Array.isArray(chunk.section.path)).toBe(true); @@ -465,7 +477,7 @@ More details here.`; }); // Verify content is preserved (at least the key parts) - const allContent = result.chunks.map((chunk) => chunk.content).join(""); + const allContent = result.chunks?.map((chunk) => chunk.content).join(""); expect(allContent).toContain("Documentation"); expect(allContent).toContain("Implementation"); expect(allContent).toContain("Hello, world!"); @@ -487,6 +499,7 @@ Final paragraph.`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -497,7 +510,7 @@ Final paragraph.`; }); // Verify semantic content is preserved (may not be perfect reconstruction due to whitespace normalization) - const allContent = result.chunks.map((chunk) => chunk.content).join(""); + const allContent = result.chunks?.map((chunk) => chunk.content).join(""); expect(allContent).toContain("# Title"); expect(allContent).toContain("## Subtitle"); expect(allContent).toContain("Paragraph with text"); @@ -506,10 +519,10 @@ Final paragraph.`; expect(allContent).toContain("Final paragraph"); // Verify we have semantic chunks - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.chunks?.length).toBeGreaterThan(0); // Verify hierarchical structure is preserved - const minLevel = Math.min(...result.chunks.map((chunk) => chunk.section.level)); + const minLevel = Math.min(...result.chunks!.map((chunk) => chunk.section.level)); expect(minLevel).toBeGreaterThanOrEqual(1); // Should not degrade to 0 }); }); diff --git a/src/scraper/pipelines/MarkdownPipeline.ts b/src/scraper/pipelines/MarkdownPipeline.ts index d856eed0..e3302392 100644 --- a/src/scraper/pipelines/MarkdownPipeline.ts +++ b/src/scraper/pipelines/MarkdownPipeline.ts @@ -12,7 +12,7 @@ import type { ContentProcessorMiddleware, MiddlewareContext } from "../middlewar import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Pipeline for processing Markdown content using middleware and semantic splitting with size 
optimization. @@ -45,22 +45,22 @@ export class MarkdownPipeline extends BasePipeline { ); } - canProcess(rawContent: RawContent): boolean { - if (!rawContent.mimeType) return false; - return MimeTypeUtils.isMarkdown(rawContent.mimeType); + canProcess(mimeType: string): boolean { + if (!mimeType) return false; + return MimeTypeUtils.isMarkdown(mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { const contentString = convertToString(rawContent.content, rawContent.charset); const context: MiddlewareContext = { + contentType: rawContent.mimeType || "text/markdown", content: contentString, source: rawContent.source, - metadata: {}, links: [], errors: [], options, @@ -77,8 +77,8 @@ export class MarkdownPipeline extends BasePipeline { ); return { + title: context.title, textContent: typeof context.content === "string" ? context.content : "", - metadata: context.metadata, links: context.links, errors: context.errors, chunks, diff --git a/src/scraper/pipelines/PipelineFactory.integration.test.ts b/src/scraper/pipelines/PipelineFactory.integration.test.ts index bbd4e2fd..67adb11f 100644 --- a/src/scraper/pipelines/PipelineFactory.integration.test.ts +++ b/src/scraper/pipelines/PipelineFactory.integration.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from "vitest"; +import { FetchStatus, type RawContent } from "../fetcher"; import { ScrapeMode } from "../types"; import { type PipelineConfiguration, PipelineFactory } from "./PipelineFactory"; @@ -23,10 +24,11 @@ describe("PipelineFactory Integration", () => { const textPipeline = pipelines[4]; // TextPipeline // Create mock RawContent for the process method - const rawContent = { + const rawContent: RawContent = { source: "test.txt", content: longContent, mimeType: "text/plain", + status: FetchStatus.SUCCESS, }; const scraperOptions = { @@ -42,8 +44,8 @@ describe("PipelineFactory Integration", () => { // Verify that chunks are smaller due to custom configuration // With 570 characters and 100 char preferred size, should be multiple chunks - expect(processed.chunks.length).toBeGreaterThan(1); // Should be split into multiple chunks - processed.chunks.forEach((chunk) => { + expect(processed.chunks?.length).toBeGreaterThan(1); // Should be split into multiple chunks + processed.chunks?.forEach((chunk) => { expect(chunk.content.length).toBeGreaterThan(0); // Should be much smaller than default 1500 expect(chunk.content.length).toBeLessThan(300); @@ -59,10 +61,11 @@ describe("PipelineFactory Integration", () => { // Test with TextPipeline const textPipeline = pipelines[4]; - const rawContent = { + const rawContent: RawContent = { source: "test.txt", content: moderateContent, mimeType: "text/plain", + status: FetchStatus.SUCCESS, }; const scraperOptions = { @@ -77,8 +80,8 @@ describe("PipelineFactory Integration", () => { const processed = await textPipeline.process(rawContent, scraperOptions); // With default chunk size (1500), this should fit in one chunk - expect(processed.chunks.length).toBe(1); - expect(processed.chunks[0].content.length).toBeLessThan(300); + expect(processed.chunks?.length).toBe(1); + expect(processed.chunks?.[0].content?.length).toBeLessThan(300); }); it("should handle different pipeline types with custom configuration", async () => { @@ -95,10 +98,11 @@ describe("PipelineFactory Integration", () => { const testContent = "This is a test content that might be split. 
".repeat(10); // ~450 characters for (const pipeline of pipelines) { - const rawContent = { + const rawContent: RawContent = { source: "test.txt", content: testContent, mimeType: "text/plain", + status: FetchStatus.SUCCESS, }; const scraperOptions = { @@ -111,10 +115,10 @@ describe("PipelineFactory Integration", () => { }; const processed = await pipeline.process(rawContent, scraperOptions); - expect(processed.chunks.length).toBeGreaterThanOrEqual(1); + expect(processed.chunks?.length).toBeGreaterThanOrEqual(1); // Verify each chunk respects the configuration - processed.chunks.forEach((chunk) => { + processed.chunks?.forEach((chunk) => { expect(chunk.content.length).toBeGreaterThan(0); // Allow some flexibility for splitting logic, but ensure it's not wildly large expect(chunk.content.length).toBeLessThan(800); @@ -140,15 +144,17 @@ describe("PipelineFactory Integration", () => { chunkSizes: { preferred: 80, max: 150 }, }); - const rawContent = { + const rawContent: RawContent = { source: "test", content, mimeType, + status: FetchStatus.SUCCESS, }; // Find the first pipeline that can process this content + const contentBuffer = Buffer.from(content); for (const pipeline of pipelines) { - if (pipeline.canProcess(rawContent)) { + if (pipeline.canProcess(mimeType, contentBuffer)) { return await pipeline.process(rawContent, baseOptions); } } @@ -171,19 +177,17 @@ describe("PipelineFactory Integration", () => { const result = await processContent(htmlContent, "text/html"); // HTML should be converted to markdown and create hierarchical structure - expect(result.chunks.length).toBeGreaterThan(1); + expect(result.chunks?.length).toBeGreaterThan(1); // Should have chunks with heading-based hierarchy - const headingChunks = result.chunks.filter( + const headingChunks = result.chunks?.filter( (chunk) => chunk.types.includes("heading") || chunk.section.path.length > 0, ); - expect(headingChunks.length).toBeGreaterThan(0); + expect(headingChunks?.length).toBeGreaterThan(0); // Should convert table to markdown format - const tableChunks = result.chunks.filter((chunk) => chunk.types.includes("table")); - if (tableChunks.length > 0) { - expect(tableChunks[0].content).toMatch(/\|.*\|/); // Markdown table format - } + const tableChunks = result.chunks?.filter((chunk) => chunk.types.includes("table")); + expect(tableChunks?.[0].content).toMatch(/\|.*\|/); // Markdown table format }); it("should process JavaScript/TypeScript with semantic code boundaries", async () => { @@ -209,18 +213,18 @@ describe("PipelineFactory Integration", () => { const result = await processContent(jsContent, "application/javascript"); // Should split along semantic boundaries (functions, classes) - expect(result.chunks.length).toBeGreaterThan(1); + expect(result.chunks?.length).toBeGreaterThan(1); // Should preserve code structure and formatting - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.types).toContain("code"); // All chunks should have content (including whitespace for perfect reconstruction) expect(chunk.content.length).toBeGreaterThan(0); }); // Should maintain perfect reconstruction - const reconstructed = result.chunks.map((chunk) => chunk.content).join(""); - expect(reconstructed.trim()).toBe(jsContent.trim()); + const reconstructed = result.chunks?.map((chunk) => chunk.content).join(""); + expect(reconstructed?.trim()).toBe(jsContent.trim()); expect(reconstructed).toContain("add(a, b)"); expect(reconstructed).toContain("multiply(a, b)"); 
expect(reconstructed).toContain('greet("World")'); @@ -256,14 +260,14 @@ describe("PipelineFactory Integration", () => { const result = await processContent(jsonContent, "application/json"); // Should handle JSON structure appropriately - expect(result.chunks.length).toBeGreaterThanOrEqual(1); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); // Should preserve JSON formatting and structure - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.content.trim()).not.toBe(""); // JSON chunks should be valid when reconstructed - const reconstructed = result.chunks.map((c) => c.content).join(""); - expect(() => JSON.parse(reconstructed)).not.toThrow(); + const reconstructed = result.chunks?.map((c) => c.content).join(""); + expect(() => JSON.parse(reconstructed || "")).not.toThrow(); }); }); @@ -298,24 +302,24 @@ More detailed content here. const result = await processContent(markdownContent, "text/markdown"); // Should create multiple chunks with different content types - expect(result.chunks.length).toBeGreaterThan(3); + expect(result.chunks?.length).toBeGreaterThan(3); // Should distinguish between content types - const contentTypes = new Set(result.chunks.flatMap((chunk) => chunk.types)); + const contentTypes = new Set(result.chunks?.flatMap((chunk) => chunk.types)); expect(contentTypes.size).toBeGreaterThan(1); // Should have multiple content types // Should create hierarchical paths based on headings - const hierarchicalChunks = result.chunks.filter( + const hierarchicalChunks = result.chunks?.filter( (chunk) => chunk.section.path.length > 0, ); - expect(hierarchicalChunks.length).toBeGreaterThan(0); + expect(hierarchicalChunks?.length).toBeGreaterThan(0); // Should preserve markdown structure - const codeChunks = result.chunks.filter((chunk) => chunk.types.includes("code")); - const tableChunks = result.chunks.filter((chunk) => chunk.types.includes("table")); + const codeChunks = result.chunks?.filter((chunk) => chunk.types.includes("code")); + const tableChunks = result.chunks?.filter((chunk) => chunk.types.includes("table")); - expect(codeChunks.length).toBeGreaterThan(0); - expect(tableChunks.length).toBeGreaterThan(0); + expect(codeChunks?.length).toBeGreaterThan(0); + expect(tableChunks?.length).toBeGreaterThan(0); }); it("should process plain text with simple structure and no hierarchy", async () => { @@ -332,18 +336,18 @@ Final paragraph here. const result = await processContent(textContent, "text/plain"); // Should split into chunks but maintain simplicity - expect(result.chunks.length).toBeGreaterThanOrEqual(1); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); // All chunks should be text type with no hierarchy - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.types).toEqual(["text"]); expect(chunk.section.path).toEqual([]); // No hierarchical structure expect(chunk.section.level).toBe(0); }); // Should preserve content exactly - const reconstructed = result.chunks.map((chunk) => chunk.content).join(""); - expect(reconstructed.trim()).toBe(textContent.trim()); + const reconstructed = result.chunks?.map((chunk) => chunk.content).join(""); + expect(reconstructed?.trim()).toBe(textContent.trim()); }); }); @@ -377,29 +381,33 @@ Content for section one that is longer than the chunk size limit. More content for section two that also exceeds the small limit. 
`; - const rawContent = { + const rawContent: RawContent = { source: "test.md", content: markdownContent, mimeType: "text/markdown", + status: FetchStatus.SUCCESS, }; // Find markdown pipeline - const markdownPipeline = pipelines.find((p) => p.canProcess(rawContent)); + const contentBuffer = Buffer.from(markdownContent); + const markdownPipeline = pipelines.find((p) => + p.canProcess(rawContent.mimeType, contentBuffer), + ); expect(markdownPipeline).toBeDefined(); const result = await markdownPipeline!.process(rawContent, baseOptions); // Even with small chunk size, should maintain semantic structure - const headingChunks = result.chunks.filter((chunk) => + const headingChunks = result.chunks?.filter((chunk) => chunk.types.includes("heading"), ); - expect(headingChunks.length).toBeGreaterThan(0); + expect(headingChunks?.length).toBeGreaterThan(0); // Should still create proper hierarchy despite size constraints - const hierarchicalChunks = result.chunks.filter( + const hierarchicalChunks = result.chunks?.filter( (chunk) => chunk.section.path.length > 0, ); - expect(hierarchicalChunks.length).toBeGreaterThan(0); + expect(hierarchicalChunks?.length).toBeGreaterThan(0); }); it("should preserve logical units in code even with large chunk sizes", async () => { @@ -423,23 +431,27 @@ class MyClass { } `; - const rawContent = { + const rawContent: RawContent = { source: "test.js", content: codeContent, mimeType: "application/javascript", + status: FetchStatus.SUCCESS, }; - const codePipeline = pipelines.find((p) => p.canProcess(rawContent)); + const contentBuffer = Buffer.from(codeContent); + const codePipeline = pipelines.find((p) => + p.canProcess(rawContent.mimeType, contentBuffer), + ); expect(codePipeline).toBeDefined(); const result = await codePipeline!.process(rawContent, baseOptions); // Even with large chunk size allowing everything in one chunk, // should still respect logical code boundaries - expect(result.chunks.length).toBeGreaterThanOrEqual(1); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); // Should maintain code structure - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.types).toContain("code"); expect(chunk.content.trim()).not.toBe(""); }); @@ -468,19 +480,23 @@ class MyClass { ]; for (const testCase of testCases) { - const rawContent = { + const rawContent: RawContent = { source: "test", content: testCase.content, mimeType: testCase.mimeType, + status: FetchStatus.SUCCESS, }; - const pipeline = pipelines.find((p) => p.canProcess(rawContent)); + const contentBuffer = Buffer.from(testCase.content); + const pipeline = pipelines.find((p) => + p.canProcess(rawContent.mimeType, contentBuffer), + ); expect(pipeline).toBeDefined(); const result = await pipeline!.process(rawContent, baseOptions); // All should respect the size constraints - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.content.length).toBeLessThanOrEqual(250); // Small buffer for edge cases }); } @@ -507,32 +523,39 @@ class MyClass { }; // No pipeline should accept unknown MIME types - const acceptingPipeline = pipelines.find((p) => p.canProcess(unknownContent)); + const contentBuffer = Buffer.from(unknownContent.content); + const acceptingPipeline = pipelines.find((p) => + p.canProcess(unknownContent.mimeType, contentBuffer), + ); expect(acceptingPipeline).toBeUndefined(); // Verify that each pipeline explicitly rejects it pipelines.forEach((pipeline) => { - expect(pipeline.canProcess(unknownContent)).toBe(false); + 
expect(pipeline.canProcess(unknownContent)).toBe(false); +
expect(pipeline.canProcess(unknownContent.mimeType, contentBuffer)).toBe(false); }); }); it("should handle invalid JSON as text content", async () => { const pipelines = PipelineFactory.createStandardPipelines(); - const invalidJsonContent = { + const invalidJsonContent: RawContent = { source: "test.json", content: '{"invalid": json, missing quotes}', mimeType: "application/json", + status: FetchStatus.SUCCESS, }; - const jsonPipeline = pipelines.find((p) => p.canProcess(invalidJsonContent)); + const contentBuffer = Buffer.from(invalidJsonContent.content); + const jsonPipeline = pipelines.find((p) => + p.canProcess(invalidJsonContent.mimeType, contentBuffer), + ); expect(jsonPipeline).toBeDefined(); const result = await jsonPipeline!.process(invalidJsonContent, baseOptions); // Should handle gracefully and process as text-like content - expect(result.chunks.length).toBeGreaterThanOrEqual(1); - expect(result.metadata.isValidJson).toBe(false); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); + // expect(result.metadata.isValidJson).toBe(false); }); it("should maintain content integrity across different processing paths", async () => { @@ -546,24 +569,28 @@ class MyClass { ]; for (const testCase of testCases) { - const rawContent = { + const rawContent: RawContent = { source: "test", content: testCase.content, mimeType: testCase.mimeType, + status: FetchStatus.SUCCESS, }; - const pipeline = pipelines.find((p) => p.canProcess(rawContent)); + const contentBuffer = Buffer.from(testCase.content); + const pipeline = pipelines.find((p) => + p.canProcess(rawContent.mimeType, contentBuffer), + ); expect(pipeline).toBeDefined(); const result = await pipeline!.process(rawContent, baseOptions); // Content should be preserved (allowing for format conversion) - expect(result.textContent.trim()).not.toBe(""); - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.textContent?.trim()).not.toBe(""); + expect(result.chunks?.length).toBeGreaterThan(0); // Should be able to reconstruct meaningful content const reconstructed = result.chunks - .map((chunk) => chunk.content) + ?.map((chunk) => chunk.content) .join("") .trim(); expect(reconstructed).not.toBe(""); diff --git a/src/scraper/pipelines/SourceCodePipeline.test.ts b/src/scraper/pipelines/SourceCodePipeline.test.ts index 31def2d3..32f1969a 100644 --- a/src/scraper/pipelines/SourceCodePipeline.test.ts +++ b/src/scraper/pipelines/SourceCodePipeline.test.ts @@ -1,5 +1,5 @@ import { beforeEach, describe, expect, it } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import type { ScraperOptions } from "../types"; import { ScrapeMode } from "../types"; import { SourceCodePipeline } from "./SourceCodePipeline"; @@ -32,44 +32,17 @@ describe("SourceCodePipeline", () => { describe("canProcess", () => { it("should accept JavaScript content types", () => { - const jsContent: RawContent = { - content: "function test() {}", - mimeType: "text/javascript", - source: "test.js", - }; - expect(pipeline.canProcess(jsContent)).toBe(true); - - const appJsContent: RawContent = { - content: "const x = 1;", - mimeType: "application/javascript", - source: "test.js", - }; - expect(pipeline.canProcess(appJsContent)).toBe(true); + expect(pipeline.canProcess("text/javascript")).toBe(true); + expect(pipeline.canProcess("application/javascript")).toBe(true); }); it("should accept TypeScript content types", () => { - const tsContent: RawContent = { - content: "interface Test { x: 
number; }", - mimeType: "text/x-typescript", - source: "test.ts", - }; - expect(pipeline.canProcess(tsContent)).toBe(true); - - const tsxContent: RawContent = { - content: "const Component = () =>
<div>Test</div>
;", - mimeType: "text/x-tsx", - source: "test.tsx", - }; - expect(pipeline.canProcess(tsxContent)).toBe(true); + expect(pipeline.canProcess("text/x-typescript")).toBe(true); + expect(pipeline.canProcess("text/x-tsx")).toBe(true); }); it("should accept JSX content types", () => { - const jsxContent: RawContent = { - content: "const Component = () =>
<div>Test</div>
;", - mimeType: "text/x-jsx", - source: "test.jsx", - }; - expect(pipeline.canProcess(jsxContent)).toBe(true); + expect(pipeline.canProcess("text/x-jsx")).toBe(true); }); it("should reject non-source code content types", () => { @@ -85,22 +58,13 @@ describe("SourceCodePipeline", () => { ]; for (const mimeType of nonCodeTypes) { - const content: RawContent = { - content: "some content", - mimeType, - source: "test.file", - }; - expect(pipeline.canProcess(content)).toBe(false); + expect(pipeline.canProcess(mimeType)).toBe(false); } }); it("should reject content without mime type", () => { - const content: RawContent = { - content: "function test() {}", - mimeType: undefined as any, - source: "test.js", - }; - expect(pipeline.canProcess(content)).toBe(false); + expect(pipeline.canProcess("")).toBe(false); + expect(pipeline.canProcess(undefined as any)).toBe(false); }); }); @@ -112,21 +76,22 @@ describe("SourceCodePipeline", () => { }`, mimeType: "text/javascript", source: "test.js", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsContent, baseOptions); expect(result.textContent).toBe(jsContent.content); - expect(result.metadata.language).toBe("javascript"); - expect(result.metadata.isSourceCode).toBe(true); + // expect(result.metadata.language).toBe("javascript"); + // expect(result.metadata.isSourceCode).toBe(true); expect(result.links).toEqual([]); expect(result.errors).toEqual([]); expect(result.chunks).toBeDefined(); expect(Array.isArray(result.chunks)).toBe(true); - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.chunks?.length).toBeGreaterThan(0); // All chunks should be marked as code - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.types).toContain("code"); }); }); @@ -145,17 +110,18 @@ class UserService { }`, mimeType: "text/x-typescript", source: "user.ts", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(tsContent, baseOptions); expect(result.textContent).toBe(tsContent.content); - expect(result.metadata.language).toBe("typescript"); - expect(result.metadata.isSourceCode).toBe(true); - expect(result.chunks.length).toBeGreaterThan(0); + // expect(result.metadata.language).toBe("typescript"); + // expect(result.metadata.isSourceCode).toBe(true); + expect(result.chunks?.length).toBeGreaterThan(0); // Should have at least one chunk with method-level hierarchy - const methodChunk = result.chunks.find( + const methodChunk = result.chunks?.find( (chunk) => chunk.section.path.includes("getUser") || chunk.section.path.includes("UserService"), @@ -170,24 +136,19 @@ class UserService { mimeType: "text/javascript", charset: "utf-8", source: "test.js", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(bufferContent, baseOptions); expect(result.textContent).toBe(codeString); - expect(result.metadata.language).toBe("javascript"); - expect(result.metadata.isSourceCode).toBe(true); + // expect(result.metadata.language).toBe("javascript"); + // expect(result.metadata.isSourceCode).toBe(true); }); it("should reject unknown programming language", async () => { - const unknownContent: RawContent = { - content: "some code in unknown language", - mimeType: "text/x-unknown", - source: "test.unknown", - }; - // Unknown MIME type should be rejected by canProcess - expect(pipeline.canProcess(unknownContent)).toBe(false); + expect(pipeline.canProcess("text/x-unknown")).toBe(false); }); }); @@ -217,15 +178,16 @@ class UserRepository implements Repository { content: tsCode, mimeType: 
"text/x-typescript", source: "user-repository.ts", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(tsContent, baseOptions); - expect(result.metadata.language).toBe("typescript"); - expect(result.chunks.length).toBeGreaterThan(0); + // expect(result.metadata.language).toBe("typescript"); + expect(result.chunks?.length).toBeGreaterThan(0); // Should preserve TypeScript structure - const hasUserRepositoryContent = result.chunks.some((chunk) => + const hasUserRepositoryContent = result.chunks?.some((chunk) => chunk.section.path.includes("UserRepository"), ); expect(hasUserRepositoryContent).toBe(true); @@ -277,15 +239,16 @@ export default ApiClient;`; content: jsCode, mimeType: "text/javascript", source: "api-client.js", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsContent, baseOptions); - expect(result.metadata.language).toBe("javascript"); - expect(result.chunks.length).toBeGreaterThan(0); + // expect(result.metadata.language).toBe("javascript"); + expect(result.chunks?.length).toBeGreaterThan(0); // Should preserve JavaScript structure - const hasApiClientContent = result.chunks.some((chunk) => + const hasApiClientContent = result.chunks?.some((chunk) => chunk.section.path.includes("ApiClient"), ); expect(hasApiClientContent).toBe(true); diff --git a/src/scraper/pipelines/SourceCodePipeline.ts b/src/scraper/pipelines/SourceCodePipeline.ts index fc5381ab..8bf892b4 100644 --- a/src/scraper/pipelines/SourceCodePipeline.ts +++ b/src/scraper/pipelines/SourceCodePipeline.ts @@ -7,7 +7,7 @@ import type { ContentProcessorMiddleware, MiddlewareContext } from "../middlewar import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Pipeline for processing source code content with semantic, structure-aware splitting. @@ -28,27 +28,28 @@ export class SourceCodePipeline extends BasePipeline { this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize: chunkSize }); } - canProcess(rawContent: RawContent): boolean { - if (!rawContent.mimeType) return false; - return MimeTypeUtils.isSourceCode(rawContent.mimeType); + canProcess(mimeType: string): boolean { + if (!mimeType) return false; + return MimeTypeUtils.isSourceCode(mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { const contentString = convertToString(rawContent.content, rawContent.charset); const context: MiddlewareContext = { + contentType: rawContent.mimeType || "text/plain", content: contentString, source: rawContent.source, - metadata: { - language: rawContent.mimeType - ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType) - : "text", - isSourceCode: true, - }, + // metadata: { + // language: rawContent.mimeType + // ? 
MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType) + // : "text", + // isSourceCode: true, + // }, links: [], // Source code files typically don't contain web links errors: [], options, @@ -62,8 +63,9 @@ export class SourceCodePipeline extends BasePipeline { const chunks = await this.splitter.splitText(context.content, rawContent.mimeType); return { + title: context.title, textContent: context.content, - metadata: context.metadata, + // metadata: context.metadata, links: context.links, errors: context.errors, chunks, diff --git a/src/scraper/pipelines/TextPipeline.test.ts b/src/scraper/pipelines/TextPipeline.test.ts index f9e4ab1a..d5b4a096 100644 --- a/src/scraper/pipelines/TextPipeline.test.ts +++ b/src/scraper/pipelines/TextPipeline.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import type { ScraperOptions } from "../types"; import { ScrapeMode } from "../types"; import { TextPipeline } from "./TextPipeline"; @@ -17,99 +17,33 @@ describe("TextPipeline", () => { describe("canProcess", () => { it("should accept text content types", () => { - const textCases: RawContent[] = [ - { - content: "plain text", - mimeType: "text/plain", - source: "test.txt", - }, - { - content: "markdown content", - mimeType: "text/markdown", - source: "test.md", - }, - { - content: "CSS content", - mimeType: "text/css", - source: "test.css", - }, - ]; - - for (const testCase of textCases) { - expect(pipeline.canProcess(testCase)).toBe(true); - } + expect(pipeline.canProcess("text/plain")).toBe(true); + expect(pipeline.canProcess("text/markdown")).toBe(true); + expect(pipeline.canProcess("text/css")).toBe(true); }); it("should accept safe application types", () => { - const safeCases: RawContent[] = [ - { - content: '', - mimeType: "application/xml", - source: "test.xml", - }, - { - content: "console.log('hello')", - mimeType: "application/javascript", - source: "test.js", - }, - { - content: "name: value", - mimeType: "application/yaml", - source: "test.yaml", - }, - ]; - - for (const testCase of safeCases) { - expect(pipeline.canProcess(testCase)).toBe(true); - } + expect(pipeline.canProcess("application/xml")).toBe(true); + expect(pipeline.canProcess("application/javascript")).toBe(true); + expect(pipeline.canProcess("application/yaml")).toBe(true); }); it("should reject binary content", () => { - const binaryCases: RawContent[] = [ - { - content: Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]), // PNG header - mimeType: "image/png", - source: "test.png", - }, - { - content: "text with null byte\0here", - mimeType: "application/octet-stream", - source: "test.bin", - }, - ]; - - for (const testCase of binaryCases) { - expect(pipeline.canProcess(testCase)).toBe(false); - } + const pngBuffer = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]); // PNG header + expect(pipeline.canProcess("image/png", pngBuffer)).toBe(false); + + const binaryContent = Buffer.from("text with null byte\0here"); + expect(pipeline.canProcess("application/octet-stream", binaryContent)).toBe(false); }); it("should reject unknown application types", () => { - const unknownCases: RawContent[] = [ - { - content: "unknown content", - mimeType: "application/unknown", - source: "test.unknown", - }, - { - content: "video data", - mimeType: "video/mp4", - source: "test.mp4", - }, - ]; - - for (const testCase of unknownCases) { - 
expect(pipeline.canProcess(testCase)).toBe(false); - } + expect(pipeline.canProcess("application/unknown")).toBe(false); + expect(pipeline.canProcess("video/mp4")).toBe(false); }); it("should reject content without mime type", () => { - const noMimeCase: RawContent = { - content: "content without mime type", - mimeType: undefined as any, - source: "test", - }; - - expect(pipeline.canProcess(noMimeCase)).toBe(false); + expect(pipeline.canProcess("")).toBe(false); + expect(pipeline.canProcess(undefined as any)).toBe(false); }); }); @@ -119,13 +53,14 @@ describe("TextPipeline", () => { content: "This is a simple text document with some content.", mimeType: "text/plain", source: "test.txt", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(textContent, baseOptions); expect(result.textContent).toBe(textContent.content); - expect(result.metadata.contentType).toBe("text/plain"); - expect(result.metadata.isGenericText).toBe(true); + // expect(result.contentType).toBe("text/plain"); + // expect(result.metadata.isGenericText).toBe(true); expect(result.links).toEqual([]); expect(result.errors).toEqual([]); expect(result.chunks).toBeDefined(); @@ -137,13 +72,14 @@ describe("TextPipeline", () => { content: "Some unknown format content", mimeType: "application/unknown", source: "test.unknown", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(unknownContent, baseOptions); expect(result.textContent).toBe(unknownContent.content); - expect(result.metadata.contentType).toBe("application/unknown"); - expect(result.metadata.isGenericText).toBe(true); + // expect(result.contentType).toBe("application/unknown"); + // expect(result.metadata.isGenericText).toBe(true); }); it("should handle content without specific mime type", async () => { @@ -151,13 +87,14 @@ describe("TextPipeline", () => { content: "Generic content", mimeType: "text/plain", source: "test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(genericContent, baseOptions); expect(result.textContent).toBe(genericContent.content); - expect(result.metadata.contentType).toBe("text/plain"); - expect(result.metadata.isGenericText).toBe(true); + // expect(result.contentType).toBe("text/plain"); + // expect(result.metadata.isGenericText).toBe(true); }); it("should handle Buffer content", async () => { @@ -166,12 +103,13 @@ describe("TextPipeline", () => { mimeType: "text/plain", charset: "utf-8", source: "test.txt", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(bufferContent, baseOptions); expect(result.textContent).toBe("Buffer content"); - expect(result.metadata.contentType).toBe("text/plain"); + // expect(result.contentType).toBe("text/plain"); }); }); }); diff --git a/src/scraper/pipelines/TextPipeline.ts b/src/scraper/pipelines/TextPipeline.ts index 3691fafd..9591f41a 100644 --- a/src/scraper/pipelines/TextPipeline.ts +++ b/src/scraper/pipelines/TextPipeline.ts @@ -10,7 +10,7 @@ import type { ContentProcessorMiddleware, MiddlewareContext } from "../middlewar import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Fallback pipeline for processing text content with basic splitting and size optimization. 
@@ -32,16 +32,16 @@ export class TextPipeline extends BasePipeline { this.splitter = new GreedySplitter(textSplitter, SPLITTER_MIN_CHUNK_SIZE, chunkSize); } - canProcess(rawContent: RawContent): boolean { + canProcess(mimeType: string, content?: string | Buffer): boolean { // This pipeline serves as a fallback for text content, but should not process binary files // First check: MIME type filtering - use utility method for safe types - if (!MimeTypeUtils.isSafeForTextProcessing(rawContent.mimeType)) { + if (!MimeTypeUtils.isSafeForTextProcessing(mimeType)) { return false; } - // Second check: binary detection via null bytes - if (MimeTypeUtils.isBinary(rawContent.content)) { + // Second check: binary detection via null bytes (if content is provided) + if (content && MimeTypeUtils.isBinary(content)) { return false; } @@ -53,16 +53,14 @@ export class TextPipeline extends BasePipeline { rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { const contentString = convertToString(rawContent.content, rawContent.charset); const context: MiddlewareContext = { + title: "", // Title extraction can be added in middleware if needed + contentType: rawContent.mimeType || "text/plain", content: contentString, source: rawContent.source, - metadata: { - contentType: rawContent.mimeType || "text/plain", - isGenericText: true, - }, links: [], // Generic text content typically doesn't contain structured links errors: [], options, @@ -76,8 +74,8 @@ export class TextPipeline extends BasePipeline { const chunks = await this.splitter.splitText(context.content, rawContent.mimeType); return { + title: context.title, textContent: context.content, - metadata: context.metadata, links: context.links, errors: context.errors, chunks, diff --git a/src/scraper/pipelines/types.ts b/src/scraper/pipelines/types.ts index ff2f29bf..39f67a90 100644 --- a/src/scraper/pipelines/types.ts +++ b/src/scraper/pipelines/types.ts @@ -1,21 +1,21 @@ -import type { ContentChunk } from "../../splitter/types"; +import type { Chunk } from "../../splitter/types"; import type { ContentFetcher, RawContent } from "../fetcher/types"; import type { ScraperOptions } from "../types"; /** * Represents the successfully processed content from a pipeline. */ -export interface ProcessedContent { +export interface PipelineResult { + /** The title of the page or document, extracted during processing */ + title?: string | null; /** The final processed content, typically as a string (e.g., Markdown). */ - textContent: string; - /** Extracted metadata (e.g., title, description). */ - metadata: Record; + textContent?: string | null; /** Extracted links from the content. */ - links: string[]; + links?: string[]; /** Any non-critical errors encountered during processing. */ - errors: Error[]; + errors?: Error[]; /** Pre-split chunks from pipeline processing */ - chunks: ContentChunk[]; + chunks?: Chunk[]; } /** @@ -25,11 +25,12 @@ export interface ProcessedContent { */ export interface ContentPipeline { /** - * Determines if this pipeline can process the given raw content. - * @param rawContent The raw content fetched from a source. + * Determines if this pipeline can process content with the given MIME type. + * @param mimeType The MIME type of the content. + * @param content Optional content (string or Buffer) for binary detection (used by TextPipeline). * @returns True if the pipeline can process the content, false otherwise. 
*/ - canProcess(rawContent: RawContent): boolean; + canProcess(mimeType: string, content?: string | Buffer): boolean; /** * Processes the raw content and optionally splits it into chunks. @@ -42,7 +43,7 @@ export interface ContentPipeline { rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise; + ): Promise; /** * Cleanup resources used by this pipeline (e.g., browser instances, database connections). diff --git a/src/scraper/strategies/BaseScraperStrategy.test.ts b/src/scraper/strategies/BaseScraperStrategy.test.ts index bc282333..628b303c 100644 --- a/src/scraper/strategies/BaseScraperStrategy.test.ts +++ b/src/scraper/strategies/BaseScraperStrategy.test.ts @@ -1,6 +1,8 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { ScraperOptions } from "../types"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import type { ProgressCallback } from "../../types"; +import { FetchStatus } from "../fetcher/types"; +import type { QueueItem, ScraperOptions, ScraperProgressEvent } from "../types"; +import { BaseScraperStrategy } from "./BaseScraperStrategy"; // Mock logger vi.mock("../../utils/logger"); @@ -34,11 +36,18 @@ describe("BaseScraperStrategy", () => { maxPages: 1, maxDepth: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockResolvedValue({ - document: { content: "test", metadata: {} }, + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [], + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); @@ -51,8 +60,19 @@ describe("BaseScraperStrategy", () => { currentUrl: "https://example.com/", depth: 0, maxDepth: 1, - document: { content: "test", metadata: {} }, - }); + pageId: undefined, + result: { + url: "https://example.com/", + title: "", + contentType: "", + textContent: "test", + etag: null, + lastModified: null, + links: [], + errors: [], + chunks: [], + }, + } satisfies ScraperProgressEvent); }); it("should respect maxPages", async () => { @@ -64,11 +84,18 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockResolvedValue({ - document: { content: "test", metadata: {} }, + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: ["https://example.com/page2", "https://example.com/page3"], + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); @@ -84,7 +111,7 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, ignoreErrors: true, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const error = new Error("Test error"); strategy.processItem.mockRejectedValue(error); @@ -104,7 +131,7 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, ignoreErrors: false, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const error = new Error("Test error"); strategy.processItem.mockRejectedValue(error); @@ -125,24 +152,38 @@ describe("BaseScraperStrategy", () => { maxPages: 5, maxDepth: 2, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // Return the same URLs multiple times to simulate duplicate links strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/") { return { - document: { content: "main page", metadata: {} }, + content: { + textContent: "main page", + 
metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/page1", "https://example.com/page1", // Duplicate "https://example.com/page2", "https://example.com/page2/", // Duplicate with trailing slash ], + status: FetchStatus.SUCCESS, }; } return { - document: { content: "sub page", metadata: {} }, + content: { + textContent: "sub page", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [], + status: FetchStatus.SUCCESS, }; }); @@ -170,7 +211,7 @@ describe("BaseScraperStrategy", () => { maxPages: 10, maxDepth: 2, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // First page returns variations of the same URL let firstPageCalled = false; @@ -178,7 +219,13 @@ describe("BaseScraperStrategy", () => { if (item.url === "https://example.com/") { firstPageCalled = true; return { - document: { content: "main page", metadata: {} }, + content: { + textContent: "main page", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/path/", "https://example.com/path", // Without trailing slash @@ -186,11 +233,19 @@ describe("BaseScraperStrategy", () => { "https://example.com/path?q=1#anchor", // With anchor "https://example.com/path", // Different case ], + status: FetchStatus.SUCCESS, }; } return { - document: { content: "sub page", metadata: {} }, + content: { + textContent: "sub page", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [], + status: FetchStatus.SUCCESS, }; }); @@ -219,7 +274,7 @@ describe("BaseScraperStrategy", () => { maxDepth: 3, maxConcurrency: 3, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // Simulate the link structure and timing strategy.processItem.mockImplementation(async (item: QueueItem) => { @@ -251,8 +306,15 @@ describe("BaseScraperStrategy", () => { } // X has no links return { - document: { content: `Content for ${url}`, metadata: {} }, + content: { + textContent: `Content for ${url}`, + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links, + status: FetchStatus.SUCCESS, }; }); @@ -297,19 +359,36 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, includePatterns: ["docs/*"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/docs/intro", "https://example.com/docs/other", "https://example.com/api/should-not-include", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -328,19 +407,36 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, includePatterns: ["/docs\\/intro.*/"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, 
links: [ "https://example.com/docs/intro", "https://example.com/docs/intro2", "https://example.com/docs/other", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -358,19 +454,36 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, excludePatterns: ["docs/private/*"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/docs/intro", "https://example.com/docs/private/secret", "https://example.com/docs/other", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -388,19 +501,36 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, excludePatterns: ["/private/"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/docs/intro", "https://example.com/docs/private/secret", "https://example.com/docs/other", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -419,19 +549,36 @@ describe("BaseScraperStrategy", () => { includePatterns: ["docs/*"], excludePatterns: ["docs/private/*"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/docs/intro", "https://example.com/docs/private/secret", "https://example.com/docs/other", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); diff --git a/src/scraper/strategies/BaseScraperStrategy.ts 
b/src/scraper/strategies/BaseScraperStrategy.ts
index 67aeab9e..7aaa7f18 100644
--- a/src/scraper/strategies/BaseScraperStrategy.ts
+++ b/src/scraper/strategies/BaseScraperStrategy.ts
@@ -1,10 +1,18 @@
 import { URL } from "node:url";
 
 import { CancellationError } from "../../pipeline/errors";
-import type { Document, ProgressCallback } from "../../types";
+import type { ProgressCallback } from "../../types";
 import { DEFAULT_MAX_PAGES } from "../../utils/config";
 import { logger } from "../../utils/logger";
 import { normalizeUrl, type UrlNormalizerOptions } from "../../utils/url";
-import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types";
+import { FetchStatus } from "../fetcher/types";
+import type { PipelineResult } from "../pipelines/types";
+import type {
+  QueueItem,
+  ScrapeResult,
+  ScraperOptions,
+  ScraperProgressEvent,
+  ScraperStrategy,
+} from "../types";
 import { shouldIncludeUrl } from "../utils/patternMatcher";
 import { isInScope } from "../utils/scope";
@@ -12,15 +20,35 @@ import { isInScope } from "../utils/scope";
 const DEFAULT_MAX_DEPTH = 3;
 const DEFAULT_CONCURRENCY = 3;
 
-export type QueueItem = {
-  url: string;
-  depth: number;
-};
-
 export interface BaseScraperStrategyOptions {
   urlNormalizerOptions?: UrlNormalizerOptions;
 }
 
+/**
+ * Result of processing a single queue item.
+ * - content: The processed content (when available)
+ * - links: Discovered links for crawling (may exist without content, e.g., directories)
+ * - status: The fetch status (SUCCESS, NOT_MODIFIED, NOT_FOUND)
+ */
+export interface ProcessItemResult {
+  /** The URL of the content */
+  url: string;
+  /** The title of the page or document, extracted during processing */
+  title?: string | null;
+  /** The MIME type of the content being processed, if known */
+  contentType?: string | null;
+  /** The ETag header value from the HTTP response, if available, used for caching and change detection. */
+  etag?: string | null;
+  /** The Last-Modified header value, if available, used for caching and change detection. */
+  lastModified?: string | null;
+  /** The pipeline-processed content, including title, text content, links, errors, and chunks. This may be null if the content was not successfully processed (e.g., 404 or 304). */
+  content?: PipelineResult;
+  /** Extracted links from the content. This may be an empty array if no links were found or if the content was not processed. */
+  links?: string[];
+  /** The fetch status of the item (SUCCESS, NOT_MODIFIED, or NOT_FOUND), which determines how the result is handled. */
+  status: FetchStatus;
+}
+
 export abstract class BaseScraperStrategy implements ScraperStrategy {
   protected visited = new Set<string>();
   protected pageCount = 0;
@@ -56,26 +84,19 @@ export abstract class BaseScraperStrategy implements ScraperStrategy {
   /**
    * Process a single item from the queue.
* - * @returns A list of URLs to add to the queue + * @returns Processed content, links, and metadata */ protected abstract processItem( item: QueueItem, options: ScraperOptions, - progressCallback?: ProgressCallback, - signal?: AbortSignal, // Add signal - ): Promise<{ - document?: Document; - links?: string[]; - finalUrl?: string; // Effective fetched URL (post-redirect) - }>; - - // Removed getProcessor method as processing is now handled by strategies using middleware pipelines + signal?: AbortSignal, + ): Promise; protected async processBatch( batch: QueueItem[], baseUrl: URL, options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, // Add signal ): Promise { const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES; @@ -93,31 +114,76 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { try { // Pass signal to processItem - const result = await this.processItem(item, options, undefined, signal); - // If this is the root (depth 0) and we have a finalUrl differing from original, set canonicalBaseUrl - if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) { - try { - const finalUrlStr = result.finalUrl as string; - const original = new URL(options.url); - const finalUrlObj = new URL(finalUrlStr); - if ( - finalUrlObj.href !== original.href && - (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:") - ) { - this.canonicalBaseUrl = finalUrlObj; - logger.debug( - `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`, - ); + const result = await this.processItem(item, options, signal); + + // Handle different fetch statuses + switch (result.status) { + case FetchStatus.NOT_MODIFIED: + // File/page hasn't changed, skip processing + logger.debug(`Page unchanged (304): ${item.url}`); + return []; + + case FetchStatus.NOT_FOUND: + // File/page was deleted + if (item.pageId) { + // Signal deletion to the pipeline for refresh operations + this.pageCount++; + logger.info(`Page deleted (404): ${item.url}`); + await progressCallback({ + pagesScraped: this.pageCount, + totalPages: this.effectiveTotal, + totalDiscovered: this.totalDiscovered, + currentUrl: item.url, + depth: item.depth, + maxDepth: options.maxDepth ?? 
DEFAULT_MAX_DEPTH, + result: null, + pageId: item.pageId, + deleted: true, + }); } else { - this.canonicalBaseUrl = original; + logger.warn(`Page not found (404): ${item.url}`); } - } catch { - // Ignore canonical base errors - this.canonicalBaseUrl = new URL(options.url); - } + return []; + + case FetchStatus.SUCCESS: + // Continue with normal processing + break; + + default: + logger.error(`Unknown fetch status: ${result.status}`); + return []; } - if (result.document) { + // FIXME: I believe this is no longer required + // // If this is the root (depth 0) and we have processed content with a URL, check for redirects + // if (item.depth === 0 && !this.canonicalBaseUrl && result?.processed) { + // try { + // const finalUrlStr = result.processed.metadata.url as string | undefined; + // if (finalUrlStr) { + // const original = new URL(options.url); + // const finalUrlObj = new URL(finalUrlStr); + // if ( + // finalUrlObj.href !== original.href && + // (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:") + // ) { + // this.canonicalBaseUrl = finalUrlObj; + // logger.debug( + // `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`, + // ); + // } else { + // this.canonicalBaseUrl = original; + // } + // } else { + // this.canonicalBaseUrl = new URL(options.url); + // } + // } catch { + // // Ignore canonical base errors + // this.canonicalBaseUrl = new URL(options.url); + // } + // } + + // Handle successful processing + if (result.content) { this.pageCount++; // maxDepth already resolved above logger.info( @@ -130,7 +196,18 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { currentUrl: item.url, depth: item.depth, maxDepth: maxDepth, - document: result.document, + result: { + url: item.url, + title: result.content.title?.trim() || result.title?.trim() || "", + contentType: result.contentType || "", + textContent: result.content.textContent || "", + links: result.content.links || [], + errors: result.content.errors || [], + chunks: result.content.chunks || [], + etag: result.etag || null, + lastModified: result.lastModified || null, + } satisfies ScrapeResult, + pageId: item.pageId, }); } @@ -190,46 +267,68 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, // Add signal ): Promise { this.visited.clear(); this.pageCount = 0; - this.totalDiscovered = 1; // Start with the initial URL (unlimited counter) - this.effectiveTotal = 1; // Start with the initial URL (limited counter) + // Check if this is a refresh operation with pre-populated queue + const initialQueue = options.initialQueue || []; + const isRefreshMode = initialQueue.length > 0; + + // Initialize queue and tracking + if (isRefreshMode) { + // Initialize from provided queue + this.totalDiscovered = initialQueue.length; + this.effectiveTotal = initialQueue.length; + + // Mark all URLs in the initial queue as visited to prevent re-discovery + for (const item of initialQueue) { + this.visited.add(normalizeUrl(item.url, this.options.urlNormalizerOptions)); + } + + logger.debug( + `Starting refresh mode with ${initialQueue.length} pre-populated pages`, + ); + } else { + // Normal scraping mode + this.totalDiscovered = 1; // Start with the initial URL (unlimited counter) + this.effectiveTotal = 1; // Start with the initial URL (limited counter) + + // Track the initial URL as visited + 
this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions)); + } + + // Set up base URL and queue this.canonicalBaseUrl = new URL(options.url); let baseUrl = this.canonicalBaseUrl; - const queue = [{ url: options.url, depth: 0 } satisfies QueueItem]; - - // Track values we've seen (either queued or visited) - this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions)); + const queue: QueueItem[] = isRefreshMode + ? [...initialQueue] + : [{ url: options.url, depth: 0 } satisfies QueueItem]; // Resolve optional values to defaults using temporary variables const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES; const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY; + // Unified processing loop for both normal and refresh modes while (queue.length > 0 && this.pageCount < maxPages) { - // Use variable // Check for cancellation at the start of each loop iteration if (signal?.aborted) { - logger.debug("Scraping cancelled by signal."); - throw new CancellationError("Scraping cancelled by signal"); + logger.debug(`${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal.`); + throw new CancellationError( + `${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal`, + ); } - const remainingPages = maxPages - this.pageCount; // Use variable + const remainingPages = maxPages - this.pageCount; if (remainingPages <= 0) { break; } - const batchSize = Math.min( - maxConcurrency, // Use variable - remainingPages, - queue.length, - ); - + const batchSize = Math.min(maxConcurrency, remainingPages, queue.length); const batch = queue.splice(0, batchSize); - // Pass signal to processBatch + // Always use latest canonical base (may have been updated after first fetch) baseUrl = this.canonicalBaseUrl ?? baseUrl; const newUrls = await this.processBatch( diff --git a/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts b/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts index 14407bb8..0671318e 100644 --- a/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts +++ b/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts @@ -1,8 +1,9 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import { HttpFetcher } from "../fetcher"; +import { FetchStatus, HttpFetcher } from "../fetcher"; import type { RawContent } from "../fetcher/types"; import { HtmlPipeline } from "../pipelines/HtmlPipeline"; import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; +import type { PipelineResult } from "../pipelines/types"; import type { ScraperOptions } from "../types"; import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; @@ -130,7 +131,7 @@ describe("GitHubRepoScraperStrategy", () => { httpFetcherInstance.fetch.mockImplementation((url: string) => { if (url.includes("api.github.com/repos")) { return Promise.resolve({ - content: JSON.stringify({ default_branch: "main" }), + textContent: JSON.stringify({ default_branch: "main" }), mimeType: "application/json", source: url, charset: "utf-8", @@ -138,7 +139,7 @@ describe("GitHubRepoScraperStrategy", () => { } if (url.includes("git/trees")) { return Promise.resolve({ - content: JSON.stringify({ + textContent: JSON.stringify({ sha: "tree123", url: "https://api.github.com/repos/owner/repo/git/trees/tree123", tree: [ @@ -172,7 +173,7 @@ describe("GitHubRepoScraperStrategy", () => { }); } return Promise.resolve({ - content: "file content", + textContent: "file content", mimeType: "text/plain", source: url, charset: "utf-8", @@ -218,13 +219,13 @@ 
describe("GitHubRepoScraperStrategy", () => { resolvedBranch: "main", }); - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); expect(result.links).toEqual([ "github-file://README.md", "github-file://src/index.js", ]); - expect(result.document).toBeUndefined(); + expect(result.content).toBeUndefined(); // Clean up the spy mockFetchRepositoryTree.mockRestore(); @@ -236,10 +237,10 @@ describe("GitHubRepoScraperStrategy", () => { url: "https://github.com/owner/repo/blob/main/README.md", }; const item = { url: "https://github.com/owner/repo/blob/main/README.md", depth: 0 }; - const result = await (strategy as any).processItem(item, blobOptions); + const result = await strategy.processItem(item, blobOptions); expect(result.links).toEqual(["github-file://README.md"]); - expect(result.document).toBeUndefined(); + expect(result.content).toBeUndefined(); }); it("should handle blob URL without file path", async () => { @@ -248,10 +249,10 @@ describe("GitHubRepoScraperStrategy", () => { url: "https://github.com/owner/repo/blob/main", }; const item = { url: "https://github.com/owner/repo/blob/main", depth: 0 }; - const result = await (strategy as any).processItem(item, blobOptions); + const result = await strategy.processItem(item, blobOptions); expect(result.links).toEqual([]); - expect(result.document).toBeUndefined(); + expect(result.content).toBeUndefined(); }); it("should process individual file content", async () => { @@ -260,11 +261,13 @@ describe("GitHubRepoScraperStrategy", () => { mimeType: "text/markdown", source: "https://raw.githubusercontent.com/owner/repo/main/README.md", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Test File\nThis is a test markdown file.", - metadata: { title: "Test File" }, + title: "Test File", + chunks: [], errors: [], links: [], }; @@ -274,18 +277,14 @@ describe("GitHubRepoScraperStrategy", () => { markdownPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "github-file://README.md", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document).toEqual({ - content: "Test File\nThis is a test markdown file.", - contentType: "text/markdown", - metadata: { - url: "https://github.com/owner/repo/blob/main/README.md", - title: "Test File", - library: "test-lib", - version: "1.0.0", - }, - }); + const result = await strategy.processItem(item, options); + + expect(result.content?.textContent).toBe( + "Test File\nThis is a test markdown file.", + ); + expect(result.contentType).toBe("text/markdown"); + expect(result.url).toBe("https://github.com/owner/repo/blob/main/README.md"); + expect(result.content?.title).toBe("Test File"); expect(result.links).toEqual([]); }); @@ -295,11 +294,13 @@ describe("GitHubRepoScraperStrategy", () => { mimeType: "text/plain", source: "https://raw.githubusercontent.com/owner/repo/main/config.txt", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Some content without title", - metadata: { title: "" }, + title: "", + chunks: [], errors: [], links: [], }; @@ -309,9 +310,9 @@ describe("GitHubRepoScraperStrategy", () => { markdownPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "github-file://config.txt", depth: 1 }; - const result = await (strategy as any).processItem(item, 
options); + const result = await strategy.processItem(item, options); - expect(result.document?.metadata.title).toBe("config.txt"); + expect(result.title).toBe("config.txt"); }); it("should handle unsupported content types", async () => { @@ -320,6 +321,7 @@ describe("GitHubRepoScraperStrategy", () => { mimeType: "application/octet-stream", source: "https://raw.githubusercontent.com/owner/repo/main/binary.bin", charset: "utf-8", + status: FetchStatus.SUCCESS, }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); @@ -327,9 +329,9 @@ describe("GitHubRepoScraperStrategy", () => { markdownPipelineInstance.canProcess.mockReturnValue(false); const item = { url: "github-file://binary.bin", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); - expect(result.document).toBeUndefined(); + expect(result.content).toBeUndefined(); expect(result.links).toEqual([]); }); }); @@ -352,7 +354,8 @@ describe("GitHubRepoScraperStrategy", () => { ]; for (const file of textFiles) { - expect((strategy as any).shouldProcessFile(file, options)).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(true); } }); @@ -365,13 +368,15 @@ describe("GitHubRepoScraperStrategy", () => { ]; for (const file of binaryFiles) { - expect((strategy as any).shouldProcessFile(file, options)).toBe(false); + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(false); } }); it("should skip tree items", () => { const treeItem = { path: "src", type: "tree" as const }; - expect((strategy as any).shouldProcessFile(treeItem, options)).toBe(false); + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(treeItem, options)).toBe(false); }); it("should respect include patterns", () => { @@ -381,20 +386,38 @@ describe("GitHubRepoScraperStrategy", () => { }; expect( - (strategy as any).shouldProcessFile( - { path: "README.md", type: "blob" as const }, + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { + path: "README.md", + type: "blob" as const, + sha: "abc123", + url: "https://api.github.com/repos/owner/repo/git/blobs/abc123", + }, optionsWithInclude, ), ).toBe(true); expect( - (strategy as any).shouldProcessFile( - { path: "src/index.js", type: "blob" as const }, + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { + path: "src/index.js", + type: "blob" as const, + sha: "def456", + url: "https://api.github.com/repos/owner/repo/git/blobs/def456", + }, optionsWithInclude, ), ).toBe(true); expect( - (strategy as any).shouldProcessFile( - { path: "package.json", type: "blob" as const }, + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { + path: "package.json", + type: "blob" as const, + sha: "ghi789", + url: "https://api.github.com/repos/owner/repo/git/blobs/ghi789", + }, optionsWithInclude, ), ).toBe(false); @@ -407,20 +430,38 @@ describe("GitHubRepoScraperStrategy", () => { }; expect( - (strategy as any).shouldProcessFile( - { path: "src/index.js", type: "blob" as const }, + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { + path: "src/index.js", + type: "blob" as const, + sha: "abc123", + url: "https://api.github.com/repos/owner/repo/git/blobs/abc123", + }, optionsWithExclude, ), ).toBe(true); expect( - (strategy as 
any).shouldProcessFile( - { path: "src/index.test.js", type: "blob" as const }, + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { + path: "src/index.test.js", + type: "blob" as const, + sha: "def456", + url: "https://api.github.com/repos/owner/repo/git/blobs/def456", + }, optionsWithExclude, ), ).toBe(false); expect( - (strategy as any).shouldProcessFile( - { path: "node_modules/package/index.js", type: "blob" as const }, + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { + path: "node_modules/package/index.js", + type: "blob" as const, + sha: "ghi789", + url: "https://api.github.com/repos/owner/repo/git/blobs/ghi789", + }, optionsWithExclude, ), ).toBe(false); diff --git a/src/scraper/strategies/GitHubRepoScraperStrategy.ts b/src/scraper/strategies/GitHubRepoScraperStrategy.ts index 576fe484..770a981c 100644 --- a/src/scraper/strategies/GitHubRepoScraperStrategy.ts +++ b/src/scraper/strategies/GitHubRepoScraperStrategy.ts @@ -1,13 +1,14 @@ -import type { Document, ProgressCallback } from "../../types"; +import type { ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; import { HttpFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline } from "../pipelines/types"; -import { ScrapeMode, type ScraperOptions, type ScraperProgress } from "../types"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem } from "../types"; +import { ScrapeMode, type ScraperOptions, type ScraperProgressEvent } from "../types"; import { shouldIncludeUrl } from "../utils/patternMatcher"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; interface GitHubRepoInfo { owner: string; @@ -364,14 +365,14 @@ export class GitHubRepoScraperStrategy extends BaseScraperStrategy { return rawContent; } - protected async processItem( + async processItem( item: QueueItem, options: ScraperOptions, - _progressCallback?: ProgressCallback, signal?: AbortSignal, - ): Promise<{ document?: Document; links?: string[] }> { + ): Promise { // Parse the URL to get repository information const repoInfo = this.parseGitHubUrl(options.url); + const pageCount = this.pageCount; // For the initial item, handle blob URLs differently than tree URLs if (item.depth === 0) { @@ -383,13 +384,17 @@ export class GitHubRepoScraperStrategy extends BaseScraperStrategy { ); // Process the single file directly - return { links: [`github-file://${repoInfo.filePath}`] }; + return { + url: item.url, + links: [`github-file://${repoInfo.filePath}`], + status: FetchStatus.SUCCESS, + }; } else { // Blob URL without file path - return empty links logger.warn( `⚠️ Blob URL without file path: ${options.url}. 
No files to process.`, ); - return { links: [] }; + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; } } @@ -410,24 +415,25 @@ export class GitHubRepoScraperStrategy extends BaseScraperStrategy { // Convert tree items to URLs for the queue const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`); - return { links }; + return { url: item.url, links, status: FetchStatus.SUCCESS }; } // Process individual files if (item.url.startsWith("github-file://")) { const filePath = item.url.replace("github-file://", ""); - logger.info( - `🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`, - ); + logger.info(`🗂️ Processing file ${pageCount}/${options.maxPages}: ${filePath}`); const rawContent = await this.fetchFileContent(repoInfo, filePath, signal); // Process content through appropriate pipeline - let processed: Awaited> | undefined; + let processed: PipelineResult | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + const contentBuffer = Buffer.isBuffer(rawContent.content) + ? rawContent.content + : Buffer.from(rawContent.content); + if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) { logger.debug( `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`, ); @@ -447,10 +453,10 @@ export class GitHubRepoScraperStrategy extends BaseScraperStrategy { logger.warn( `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`, ); - return { document: undefined, links: [] }; + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; } - for (const err of processed.errors) { + for (const err of processed.errors ?? []) { logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); } @@ -458,29 +464,21 @@ export class GitHubRepoScraperStrategy extends BaseScraperStrategy { const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`; // Use filename as fallback if title is empty or not a string - const processedTitle = processed.metadata.title; - const hasValidTitle = - typeof processedTitle === "string" && processedTitle.trim() !== ""; - const fallbackTitle = filePath.split("/").pop() || "Untitled"; + const filename = filePath.split("/").pop() || "Untitled"; return { - document: { - content: typeof processed.textContent === "string" ? processed.textContent : "", - metadata: { - url: githubUrl, - title: hasValidTitle ? 
processedTitle : fallbackTitle, - library: options.library, - version: options.version, - etag: rawContent.etag, - lastModified: rawContent.lastModified, - }, - contentType: rawContent.mimeType, // Preserve the detected MIME type - } satisfies Document, + url: githubUrl, + title: processed.title?.trim() || filename || "Untitled", + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, links: [], // Always return empty links array for individual files + status: FetchStatus.SUCCESS, }; } - return { document: undefined, links: [] }; + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; } /** @@ -510,7 +508,7 @@ export class GitHubRepoScraperStrategy extends BaseScraperStrategy { async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, ): Promise { // Validate it's a GitHub URL diff --git a/src/scraper/strategies/GitHubScraperStrategy.test.ts b/src/scraper/strategies/GitHubScraperStrategy.test.ts index 24bf66ff..bcddfdf7 100644 --- a/src/scraper/strategies/GitHubScraperStrategy.test.ts +++ b/src/scraper/strategies/GitHubScraperStrategy.test.ts @@ -1,4 +1,6 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { ProgressCallback } from "../../types"; +import type { ScraperProgressEvent } from "../types"; import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; import { GitHubScraperStrategy } from "./GitHubScraperStrategy"; import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; @@ -77,7 +79,7 @@ describe("GitHubScraperStrategy", () => { version: "1.0.0", }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); repoStrategyInstance.scrape.mockResolvedValue(undefined); wikiStrategyInstance.scrape.mockResolvedValue(undefined); @@ -112,7 +114,7 @@ describe("GitHubScraperStrategy", () => { version: "1.0.0", }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); repoStrategyInstance.scrape.mockResolvedValue(undefined); wikiStrategyInstance.scrape.mockRejectedValue(new Error("Wiki not found")); @@ -131,7 +133,7 @@ describe("GitHubScraperStrategy", () => { version: "1.0.0", }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await expect(strategy.scrape(options, progressCallback)).rejects.toThrow( "URL must be a GitHub URL", @@ -145,7 +147,7 @@ describe("GitHubScraperStrategy", () => { version: "1.0.0", }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await expect(strategy.scrape(options, progressCallback)).rejects.toThrow( "URL must be a base GitHub repository URL", diff --git a/src/scraper/strategies/GitHubScraperStrategy.ts b/src/scraper/strategies/GitHubScraperStrategy.ts index 55df19aa..cfc1bea0 100644 --- a/src/scraper/strategies/GitHubScraperStrategy.ts +++ b/src/scraper/strategies/GitHubScraperStrategy.ts @@ -1,6 +1,6 @@ import type { ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; -import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; +import type { ScraperOptions, ScraperProgressEvent, ScraperStrategy } from "../types"; import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; @@ -48,7 +48,7 @@ export class GitHubScraperStrategy implements ScraperStrategy { async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + 
progressCallback: ProgressCallback, signal?: AbortSignal, ): Promise { // Validate it's a GitHub URL @@ -72,7 +72,7 @@ export class GitHubScraperStrategy implements ScraperStrategy { let wikiCompleted = false; let repoCompleted = false; - const mergedProgressCallback: ProgressCallback = async ( + const mergedProgressCallback: ProgressCallback = async ( progress, ) => { // For the first strategy (wiki), accumulate discovered pages and scraped count diff --git a/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts b/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts index 868fc30b..c2ebb366 100644 --- a/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts +++ b/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts @@ -1,8 +1,9 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { HttpFetcher } from "../fetcher"; +import { FetchStatus, HttpFetcher } from "../fetcher"; import type { RawContent } from "../fetcher/types"; import { HtmlPipeline } from "../pipelines/HtmlPipeline"; import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; +import type { PipelineResult } from "../pipelines/types"; import { ScrapeMode, type ScraperOptions } from "../types"; import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; @@ -85,9 +86,7 @@ describe("GitHubWikiScraperStrategy", () => { describe("parseGitHubWikiUrl", () => { it("should parse basic wiki URL", () => { - const result = (strategy as any).parseGitHubWikiUrl( - "https://github.com/owner/repo/wiki", - ); + const result = strategy.parseGitHubWikiUrl("https://github.com/owner/repo/wiki"); expect(result).toEqual({ owner: "owner", repo: "repo", @@ -95,9 +94,7 @@ describe("GitHubWikiScraperStrategy", () => { }); it("should parse wiki URL with trailing slash", () => { - const result = (strategy as any).parseGitHubWikiUrl( - "https://github.com/owner/repo/wiki/", - ); + const result = strategy.parseGitHubWikiUrl("https://github.com/owner/repo/wiki/"); expect(result).toEqual({ owner: "owner", repo: "repo", @@ -105,7 +102,7 @@ describe("GitHubWikiScraperStrategy", () => { }); it("should parse wiki URL with specific page", () => { - const result = (strategy as any).parseGitHubWikiUrl( + const result = strategy.parseGitHubWikiUrl( "https://github.com/owner/repo/wiki/Home", ); expect(result).toEqual({ @@ -115,7 +112,7 @@ describe("GitHubWikiScraperStrategy", () => { }); it("should parse wiki URL with complex page name", () => { - const result = (strategy as any).parseGitHubWikiUrl( + const result = strategy.parseGitHubWikiUrl( "https://github.com/owner/repo/wiki/Getting-Started-Guide", ); expect(result).toEqual({ @@ -125,7 +122,7 @@ describe("GitHubWikiScraperStrategy", () => { }); it("should handle www subdomain", () => { - const result = (strategy as any).parseGitHubWikiUrl( + const result = strategy.parseGitHubWikiUrl( "https://www.github.com/owner/repo/wiki", ); expect(result).toEqual({ @@ -136,11 +133,11 @@ describe("GitHubWikiScraperStrategy", () => { it("should throw error for invalid wiki URL", () => { expect(() => { - (strategy as any).parseGitHubWikiUrl("https://github.com/invalid"); + strategy.parseGitHubWikiUrl("https://github.com/invalid"); }).toThrow("Invalid GitHub wiki URL"); expect(() => { - (strategy as any).parseGitHubWikiUrl("https://github.com/owner/repo"); + strategy.parseGitHubWikiUrl("https://github.com/owner/repo"); }).toThrow("Invalid GitHub wiki URL"); }); }); @@ -154,19 +151,16 @@ describe("GitHubWikiScraperStrategy", () => { it("should process URLs within the 
same wiki", () => { expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/Home", - options, - ), + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl("https://github.com/owner/repo/wiki/Home", options), ).toBe(true); expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/API", - options, - ), + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl("https://github.com/owner/repo/wiki/API", options), ).toBe(true); expect( - (strategy as any).shouldProcessUrl( + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl( "https://github.com/owner/repo/wiki/Getting-Started", options, ), @@ -175,19 +169,16 @@ describe("GitHubWikiScraperStrategy", () => { it("should not process URLs outside the wiki", () => { expect( - (strategy as any).shouldProcessUrl("https://github.com/owner/repo", options), + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl("https://github.com/owner/repo", options), ).toBe(false); expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/tree/main", - options, - ), + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl("https://github.com/owner/repo/tree/main", options), ).toBe(false); expect( - (strategy as any).shouldProcessUrl( - "https://github.com/other/repo/wiki/Home", - options, - ), + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl("https://github.com/other/repo/wiki/Home", options), ).toBe(false); }); @@ -198,19 +189,22 @@ describe("GitHubWikiScraperStrategy", () => { }; expect( - (strategy as any).shouldProcessUrl( + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl( "https://github.com/owner/repo/wiki/API-Reference", optionsWithInclude, ), ).toBe(true); expect( - (strategy as any).shouldProcessUrl( + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl( "https://github.com/owner/repo/wiki/Getting-Started", optionsWithInclude, ), ).toBe(true); expect( - (strategy as any).shouldProcessUrl( + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl( "https://github.com/owner/repo/wiki/Home", optionsWithInclude, ), @@ -224,19 +218,22 @@ describe("GitHubWikiScraperStrategy", () => { }; expect( - (strategy as any).shouldProcessUrl( + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl( "https://github.com/owner/repo/wiki/deprecated-api", optionsWithExclude, ), ).toBe(false); expect( - (strategy as any).shouldProcessUrl( + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl( "https://github.com/owner/repo/wiki/old-guide", optionsWithExclude, ), ).toBe(false); expect( - (strategy as any).shouldProcessUrl( + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl( "https://github.com/owner/repo/wiki/current-guide", optionsWithExclude, ), @@ -245,19 +242,20 @@ describe("GitHubWikiScraperStrategy", () => { it("should handle Home page as default", () => { expect( - (strategy as any).shouldProcessUrl("https://github.com/owner/repo/wiki", options), + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl("https://github.com/owner/repo/wiki", options), ).toBe(true); expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/", - options, - ), + // @ts-expect-error - testing internal method + strategy.shouldProcessUrl("https://github.com/owner/repo/wiki/", options), ).toBe(true); }); it("should handle 
malformed URLs gracefully", () => { - expect((strategy as any).shouldProcessUrl("invalid-url", options)).toBe(false); - expect((strategy as any).shouldProcessUrl("", options)).toBe(false); + // @ts-expect-error - testing internal method + expect(strategy.shouldProcessUrl("invalid-url", options)).toBe(false); + // @ts-expect-error - testing internal method + expect(strategy.shouldProcessUrl("", options)).toBe(false); }); }); @@ -288,12 +286,14 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Home", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Wiki Home\n\nWelcome to the Wiki\n\nThis is the home page of our documentation.", - metadata: { title: "Wiki Home" }, + title: "Wiki Home", + chunks: [], errors: [], links: [ "/owner/repo/wiki/API", @@ -307,18 +307,15 @@ describe("GitHubWikiScraperStrategy", () => { htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); - expect(result.document).toEqual({ - content: + expect(result.content).toEqual({ + textContent: "Wiki Home\n\nWelcome to the Wiki\n\nThis is the home page of our documentation.", - contentType: "text/html", - metadata: { - url: "https://github.com/owner/repo/wiki/Home", - title: "Wiki Home", - library: "test-lib", - version: "1.0.0", - }, + title: "Wiki Home", + chunks: expect.any(Array), + links: expect.any(Array), + errors: expect.any(Array), }); // Should only include wiki links, not external links @@ -334,11 +331,12 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Getting-Started", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Content without title", - metadata: { title: "" }, + chunks: [], errors: [], links: [], }; @@ -351,9 +349,9 @@ describe("GitHubWikiScraperStrategy", () => { url: "https://github.com/owner/repo/wiki/Getting-Started", depth: 1, }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); - expect(result.document?.metadata.title).toBe("Getting-Started"); + expect(result.title).toBe("Getting-Started"); }); it("should handle Home page title fallback", async () => { @@ -362,11 +360,12 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "text/html", source: "https://github.com/owner/repo/wiki", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Home page content", - metadata: { title: "" }, + chunks: [], errors: [], links: [], }; @@ -376,9 +375,9 @@ describe("GitHubWikiScraperStrategy", () => { htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); - expect(result.document?.metadata.title).toBe("Home"); + expect(result.title).toBe("Home"); }); it("should force ScrapeMode.Fetch for consistent behavior", async () => { @@ -387,11 +386,13 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "text/html", source: 
"https://github.com/owner/repo/wiki/Test", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Test", - metadata: { title: "Test" }, + title: "Test", + chunks: [], errors: [], links: [], }; @@ -411,7 +412,7 @@ describe("GitHubWikiScraperStrategy", () => { }; const item = { url: "https://github.com/owner/repo/wiki/Test", depth: 1 }; - await (strategy as any).processItem(item, optionsWithPlaywright); + await strategy.processItem(item, optionsWithPlaywright); expect(htmlPipelineInstance.process).toHaveBeenCalledWith( rawContent, @@ -426,6 +427,7 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "application/octet-stream", source: "https://github.com/owner/repo/wiki/Binary", charset: "utf-8", + status: FetchStatus.SUCCESS, }; httpFetcherInstance.fetch.mockResolvedValue(rawContent); @@ -433,9 +435,9 @@ describe("GitHubWikiScraperStrategy", () => { markdownPipelineInstance.canProcess.mockReturnValue(false); const item = { url: "https://github.com/owner/repo/wiki/Binary", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); - expect(result.document).toBeUndefined(); + expect(result.content).toBeUndefined(); expect(result.links).toEqual([]); }); @@ -443,9 +445,9 @@ describe("GitHubWikiScraperStrategy", () => { httpFetcherInstance.fetch.mockRejectedValue(new Error("Network error")); const item = { url: "https://github.com/owner/repo/wiki/Unreachable", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); - expect(result.document).toBeUndefined(); + expect(result.content).toBeUndefined(); expect(result.links).toEqual([]); }); @@ -455,11 +457,13 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Test", charset: "utf-8", + status: FetchStatus.SUCCESS, }; const processedContentWithErrors = { textContent: "Test", metadata: { title: "Test" }, + chunks: [], errors: [new Error("Processing warning")], links: [], }; @@ -469,10 +473,10 @@ describe("GitHubWikiScraperStrategy", () => { htmlPipelineInstance.process.mockResolvedValue(processedContentWithErrors); const item = { url: "https://github.com/owner/repo/wiki/Test", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); - expect(result.document).toBeDefined(); - expect(result.document?.content).toBe("Test"); + expect(result.content).toBeDefined(); + expect(result.content?.textContent).toBe("Test"); }); }); @@ -600,11 +604,13 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Home", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Content", - metadata: { title: "Test" }, + title: "Test", + chunks: [], errors: [], links: ["/owner/repo/wiki/API", "Getting-Started", "./Advanced-Topics"], }; @@ -614,7 +620,7 @@ describe("GitHubWikiScraperStrategy", () => { htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); expect(result.links).toEqual([ "https://github.com/owner/repo/wiki/API", 
@@ -629,11 +635,13 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Home", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Content", - metadata: { title: "Test" }, + title: "Test", + chunks: [], errors: [], links: [ "https://github.com/owner/repo/wiki/API", // Should include @@ -649,7 +657,7 @@ describe("GitHubWikiScraperStrategy", () => { htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); expect(result.links).toEqual(["https://github.com/owner/repo/wiki/API"]); }); @@ -660,11 +668,13 @@ describe("GitHubWikiScraperStrategy", () => { mimeType: "text/html", source: "https://github.com/owner/repo/wiki/Home", charset: "utf-8", + status: FetchStatus.SUCCESS, }; - const processedContent = { + const processedContent: PipelineResult = { textContent: "Content", - metadata: { title: "Test" }, + title: "Test", + chunks: [], errors: [], links: [ "invalid-url", @@ -679,7 +689,7 @@ describe("GitHubWikiScraperStrategy", () => { htmlPipelineInstance.process.mockResolvedValue(processedContent); const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await (strategy as any).processItem(item, options); + const result = await strategy.processItem(item, options); // Should only include the valid wiki link expect(result.links).toEqual(["https://github.com/owner/repo/wiki/Valid"]); diff --git a/src/scraper/strategies/GitHubWikiScraperStrategy.ts b/src/scraper/strategies/GitHubWikiScraperStrategy.ts index e99336a2..2c62ccab 100644 --- a/src/scraper/strategies/GitHubWikiScraperStrategy.ts +++ b/src/scraper/strategies/GitHubWikiScraperStrategy.ts @@ -1,11 +1,13 @@ -import type { Document, ProgressCallback } from "../../types"; +import type { ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import { HttpFetcher } from "../fetcher"; +import { FetchStatus } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline } from "../pipelines/types"; -import { ScrapeMode, type ScraperOptions, type ScraperProgress } from "../types"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem } from "../types"; +import { ScrapeMode, type ScraperOptions, type ScraperProgressEvent } from "../types"; import { shouldIncludeUrl } from "../utils/patternMatcher"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; interface GitHubWikiInfo { owner: string; @@ -93,12 +95,11 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { } } - protected async processItem( + async processItem( item: QueueItem, options: ScraperOptions, - _progressCallback?: ProgressCallback, signal?: AbortSignal, - ): Promise<{ document?: Document; links?: string[] }> { + ): Promise { const currentUrl = item.url; logger.info( @@ -110,10 +111,10 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { const rawContent = await this.httpFetcher.fetch(currentUrl, { signal }); // Process content through appropriate pipeline - let processed: Awaited> | undefined; + let processed: 
PipelineResult | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) { logger.debug( `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`, ); @@ -130,10 +131,10 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { logger.warn( `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`, ); - return { document: undefined, links: [] }; + return { url: currentUrl, links: [], status: FetchStatus.SUCCESS }; } - for (const err of processed.errors) { + for (const err of processed.errors ?? []) { logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`); } @@ -145,24 +146,6 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { .replace(/^\//, ""); const pageTitle = wikiPagePath || "Home"; - // Create document with wiki-specific metadata - const document: Document = { - content: typeof processed.textContent === "string" ? processed.textContent : "", - metadata: { - url: currentUrl, - title: - typeof processed.metadata.title === "string" && - processed.metadata.title.trim() !== "" - ? processed.metadata.title - : pageTitle, - library: options.library, - version: options.version, - etag: rawContent.etag, - lastModified: rawContent.lastModified, - }, - contentType: rawContent.mimeType, - }; - // Extract links from the processed content const links = processed.links || []; @@ -202,16 +185,25 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { } }); - return { document, links: wikiLinks }; + return { + url: currentUrl, + title: pageTitle, + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, + links: wikiLinks, + status: FetchStatus.SUCCESS, + }; } catch (error) { logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`); - return { document: undefined, links: [] }; + return { url: currentUrl, links: [], status: FetchStatus.SUCCESS }; } } async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, ): Promise { // Validate it's a GitHub wiki URL diff --git a/src/scraper/strategies/LocalFileStrategy.test.ts b/src/scraper/strategies/LocalFileStrategy.test.ts index 0f07a0b9..58d70195 100644 --- a/src/scraper/strategies/LocalFileStrategy.test.ts +++ b/src/scraper/strategies/LocalFileStrategy.test.ts @@ -1,6 +1,7 @@ import { vol } from "memfs"; import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { ScraperOptions } from "../types"; +import type { ProgressCallback } from "../../types"; +import type { ScrapeResult, ScraperOptions, ScraperProgressEvent } from "../types"; import { LocalFileStrategy } from "./LocalFileStrategy"; vi.mock("node:fs/promises", () => ({ default: vol.promises })); @@ -27,7 +28,7 @@ describe("LocalFileStrategy", () => { maxPages: 1, maxDepth: 0, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { @@ -39,27 +40,37 @@ describe("LocalFileStrategy", () => { await strategy.scrape(options, progressCallback); expect(progressCallback).toHaveBeenCalledTimes(1); - expect(progressCallback).toHaveBeenCalledWith( - expect.objectContaining({ - pagesScraped: 1, - currentUrl: "file:///test.md", - depth: 0, - maxDepth: 0, - totalPages: 1, - document: { - content: "# Test\n\nThis is a test file.", - contentType: 
"text/markdown", - metadata: expect.objectContaining({ - url: "file:///test.md", - title: "Test", - library: "test", - version: "1.0", - etag: expect.any(String), - lastModified: expect.any(String), - }), - }, - }), - ); + + const firstCall = progressCallback.mock.calls[0][0]; + expect(firstCall).toMatchObject({ + pagesScraped: 1, + currentUrl: "file:///test.md", + depth: 0, + maxDepth: 0, + totalPages: 1, + totalDiscovered: 1, + pageId: undefined, + result: { + textContent: "# Test\n\nThis is a test file.", + contentType: "text/markdown", + url: "file:///test.md", + title: "Test", + links: [], + errors: [], + chunks: [ + { + content: "# Test\nThis is a test file.", // content is simplified + section: { + level: 1, + path: ["Test"], + }, + types: ["heading", "text"], + }, + ], + }, + } satisfies Partial); + expect(firstCall.result?.etag).toBeDefined(); + expect(firstCall.result?.lastModified).toBeDefined(); }); it("should process a directory with files and a subdirectory", async () => { @@ -71,7 +82,7 @@ describe("LocalFileStrategy", () => { maxPages: 10, maxDepth: 2, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { @@ -98,7 +109,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/testdir/file1.md": "# File 1", @@ -110,7 +121,7 @@ describe("LocalFileStrategy", () => { ); await strategy.scrape(options, progressCallback); - // All 3 files are processed: file1.md, file2.html, and file3.txt (as markdown) + // All 3 files are page: file1.md, file2.html, and file3.txt (as markdown) expect(progressCallback).toHaveBeenCalledTimes(3); // Validate .md @@ -122,16 +133,14 @@ describe("LocalFileStrategy", () => { depth: 1, maxDepth: 1, totalPages: 4, - document: expect.objectContaining({ - content: "# File 1", - metadata: expect.objectContaining({ - url: "file:///testdir/file1.md", - title: "File 1", - library: "test", - version: "1.0", - }), - }), - }), + totalDiscovered: 4, + result: expect.objectContaining({ + textContent: "# File 1", + contentType: "text/markdown", + url: "file:///testdir/file1.md", + title: "File 1", + } satisfies Partial), + } satisfies Partial), ); // Validate .html expect(progressCallback).toHaveBeenNthCalledWith( @@ -142,16 +151,14 @@ describe("LocalFileStrategy", () => { depth: 1, maxDepth: 1, totalPages: 4, - document: expect.objectContaining({ - content: expect.stringContaining("# File 2"), - metadata: expect.objectContaining({ - url: "file:///testdir/file2.html", - title: "File 2 Title", - library: "test", - version: "1.0", - }), - }), - }), + totalDiscovered: 4, + result: expect.objectContaining({ + textContent: expect.stringContaining("# File 2"), + contentType: "text/html", + url: "file:///testdir/file2.html", + title: "File 2 Title", + } satisfies Partial), + } satisfies Partial), ); // Validate .txt expect(progressCallback).toHaveBeenNthCalledWith( @@ -162,16 +169,14 @@ describe("LocalFileStrategy", () => { depth: 1, maxDepth: 1, totalPages: 4, - document: expect.objectContaining({ - content: "File 3", - metadata: expect.objectContaining({ - url: "file:///testdir/file3.txt", - title: "Untitled", - library: "test", - version: "1.0", - }), - }), - }), + totalDiscovered: 4, + result: expect.objectContaining({ + textContent: "File 3", + contentType: "text/plain", + url: "file:///testdir/file3.txt", + title: "file3.txt", + } satisfies Partial), + } satisfies Partial), ); }); @@ -185,7 +190,7 @@ 
describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { @@ -209,105 +214,96 @@ describe("LocalFileStrategy", () => { // Check TypeScript file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "app.ts", + textContent: expect.stringContaining("interface User"), contentType: "text/x-typescript", - content: expect.stringContaining("interface User"), - metadata: expect.objectContaining({ - url: "file:///codebase/app.ts", - }), - }), - }), + url: "file:///codebase/app.ts", + } satisfies Partial), + } satisfies Partial), ); // Check TSX file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "component.tsx", + textContent: expect.stringContaining("export const App"), contentType: "text/x-tsx", - content: expect.stringContaining("export const App"), - metadata: expect.objectContaining({ - url: "file:///codebase/component.tsx", - }), - }), - }), + url: "file:///codebase/component.tsx", + } satisfies Partial), + } satisfies Partial), ); // Check Python file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "script.py", + textContent: expect.stringContaining("def hello"), contentType: "text/x-python", - content: expect.stringContaining("def hello"), - metadata: expect.objectContaining({ - url: "file:///codebase/script.py", - }), - }), - }), + url: "file:///codebase/script.py", + } satisfies Partial), + } satisfies Partial), ); // Check Go file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "main.go", + textContent: expect.stringContaining("package main"), contentType: "text/x-go", - content: expect.stringContaining("package main"), - metadata: expect.objectContaining({ - url: "file:///codebase/main.go", - }), - }), - }), + url: "file:///codebase/main.go", + } satisfies Partial), + } satisfies Partial), ); // Check Rust file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "lib.rs", + textContent: expect.stringContaining("fn main"), contentType: "text/x-rust", - content: expect.stringContaining("fn main"), - metadata: expect.objectContaining({ - url: "file:///codebase/lib.rs", - }), - }), - }), + url: "file:///codebase/lib.rs", + } satisfies Partial), + } satisfies Partial), ); // Check Kotlin file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "App.kt", + textContent: expect.stringContaining("fun main"), contentType: "text/x-kotlin", - content: expect.stringContaining("fun main"), - metadata: expect.objectContaining({ - url: "file:///codebase/App.kt", - }), - }), - }), + url: "file:///codebase/App.kt", + } satisfies Partial), + } satisfies Partial), ); // Check Ruby file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "script.rb", + textContent: expect.stringContaining("puts"), contentType: "text/x-ruby", - content: expect.stringContaining("puts"), - metadata: 
expect.objectContaining({ - url: "file:///codebase/script.rb", - }), - }), - }), + url: "file:///codebase/script.rb", + } satisfies Partial), + } satisfies Partial), ); // Check Shell script expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + textContent: expect.stringContaining("#!/bin/bash"), contentType: "text/x-shellscript", - content: expect.stringContaining("#!/bin/bash"), - metadata: expect.objectContaining({ - url: "file:///codebase/run.sh", - }), - }), - }), + url: "file:///codebase/run.sh", + } satisfies Partial), + } satisfies Partial), ); }); @@ -321,7 +317,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/testdir/empty.md": "", @@ -336,16 +332,13 @@ describe("LocalFileStrategy", () => { expect.objectContaining({ pagesScraped: 1, currentUrl: "file:///testdir/empty.md", - document: expect.objectContaining({ - content: "", - metadata: expect.objectContaining({ - title: "Untitled", - url: "file:///testdir/empty.md", - library: "test", - version: "1.0", - }), - }), - }), + result: expect.objectContaining({ + textContent: "", + contentType: "text/markdown", + title: "Untitled", + url: "file:///testdir/empty.md", + } satisfies Partial), + } satisfies Partial), ); }); @@ -359,7 +352,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // Simulate a binary file (with null bytes) and an image file vol.fromJSON( { @@ -399,7 +392,7 @@ describe("LocalFileStrategy", () => { includePatterns: ["/file1.md", "/file3.txt"], excludePatterns: ["/file3.txt"], // exclude takes precedence }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/testdir/file1.md": "# File 1", // should be included @@ -426,7 +419,7 @@ describe("LocalFileStrategy", () => { maxPages: 1, maxDepth: 0, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/test dir/space file.md": "# Space File\n\nThis file has spaces in its name.", @@ -439,14 +432,13 @@ describe("LocalFileStrategy", () => { expect.objectContaining({ pagesScraped: 1, currentUrl: "file:///test%20dir/space%20file.md", - document: expect.objectContaining({ - content: "# Space File\n\nThis file has spaces in its name.", - metadata: expect.objectContaining({ - url: "file:///test%20dir/space%20file.md", - title: "Space File", - }), - }), - }), + result: expect.objectContaining({ + textContent: "# Space File\n\nThis file has spaces in its name.", + contentType: "text/markdown", + url: "file:///test%20dir/space%20file.md", + title: "Space File", + } satisfies Partial), + } satisfies Partial), ); }); @@ -460,7 +452,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/test dir/file with space.md": "# File With Space", @@ -485,7 +477,7 @@ describe("LocalFileStrategy", () => { maxPages: 1, maxDepth: 0, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // Create a JSON file with API documentation structure const jsonContent = JSON.stringify( @@ -540,17 +532,13 @@ describe("LocalFileStrategy", () => { maxDepth: 0, totalPages: 1, totalDiscovered: 1, - document: expect.objectContaining({ - content: jsonContent, + result: expect.objectContaining({ + 
textContent: jsonContent, contentType: "application/json", - metadata: expect.objectContaining({ - library: "test-api", - title: "Test API Documentation", - url: "file:///api-docs.json", - version: "1.0.0", - }), - }), - }), + title: "Test API Documentation", + url: "file:///api-docs.json", + } satisfies Partial), + } satisfies Partial), ); }); @@ -564,7 +552,7 @@ describe("LocalFileStrategy", () => { maxDepth: 0, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testContent = "# Test Content\nThis is a test file."; vol.fromJSON( @@ -581,17 +569,294 @@ describe("LocalFileStrategy", () => { expect.objectContaining({ pagesScraped: 1, currentUrl: "file://testdir/test.md", // Original malformed URL preserved - document: expect.objectContaining({ - content: testContent, + result: expect.objectContaining({ + textContent: testContent, contentType: "text/markdown", - metadata: expect.objectContaining({ - title: "Test Content", - url: "file://testdir/test.md", - library: "test", - version: "1.0", - }), - }), - }), + title: "Test Content", + url: "file://testdir/test.md", + } satisfies Partial), + } satisfies Partial), ); }); + + describe("refresh workflow", () => { + it("should skip processing when file returns NOT_MODIFIED (unchanged)", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + const testContent = "# Test File\nOriginal content"; + + // Create initial file + vol.fromJSON({ "/test.md": testContent }, "/"); + + // First scrape to get the initial etag + const initialOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + }; + + await strategy.scrape(initialOptions, progressCallback); + expect(progressCallback).toHaveBeenCalledTimes(1); + + // Get the etag from the first scrape + const firstCall = progressCallback.mock.calls[0][0]; + const etag = firstCall.result?.etag; + + // Reset the callback + progressCallback.mockClear(); + + // Now do a refresh with the same etag (file unchanged) + const refreshOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + initialQueue: [ + { + url: "file:///test.md", + depth: 0, + pageId: 123, + etag: etag, + }, + ], + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Verify no documents were processed (file unchanged) + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(0); + }); + + it("should re-process file when it has been modified", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + const originalContent = "# Original\nOriginal content"; + const updatedContent = "# Updated\nNew updated content"; + + // Create initial file + vol.fromJSON({ "/test.md": originalContent }, "/"); + + // First scrape + const initialOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + }; + + await strategy.scrape(initialOptions, progressCallback); + const firstCall = progressCallback.mock.calls[0][0]; + const oldEtag = firstCall.result?.etag; + + // Modify the file (update content and mtime) + // Using a new date for fromJSON will create a new mtime + vol.reset(); + vol.fromJSON({ "/test.md": updatedContent }, "/"); + + // Wait a bit to ensure different mtime + await new Promise((resolve) => setTimeout(resolve, 10)); + + progressCallback.mockClear(); + + // 
Refresh with old etag + const refreshOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + initialQueue: [ + { + url: "file:///test.md", + depth: 0, + pageId: 456, + etag: oldEtag, + }, + ], + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Verify file was re-processed + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(1); + expect(docCalls[0][0].result?.textContent).toContain("# Updated"); + expect(docCalls[0][0].result?.textContent).toContain("New updated content"); + expect(docCalls[0][0].result?.title).toBe("Updated"); + // Verify new etag is different + expect(docCalls[0][0].result?.etag).not.toBe(oldEtag); + }); + + it("should handle deleted files during refresh", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + const testContent = "# Test File\nContent"; + + // Create initial file + vol.fromJSON({ "/test.md": testContent }, "/"); + + // First scrape + const initialOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + }; + + await strategy.scrape(initialOptions, progressCallback); + const firstCall = progressCallback.mock.calls[0][0]; + const etag = firstCall.result?.etag; + + // Delete the file + vol.reset(); + + progressCallback.mockClear(); + + // Refresh with deleted file + const refreshOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + initialQueue: [ + { + url: "file:///test.md", + depth: 0, + pageId: 789, + etag: etag, + }, + ], + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Verify no processed documents were returned + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(0); + }); + + it("should discover and process new files in a directory during refresh", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + + // Create initial directory with one file + vol.fromJSON( + { + "/testdir/file1.md": "# File 1", + }, + "/", + ); + + // First scrape + const initialOptions: ScraperOptions = { + url: "file:///testdir", + library: "test", + version: "1.0", + maxPages: 10, + maxDepth: 1, + }; + + await strategy.scrape(initialOptions, progressCallback); + expect(progressCallback).toHaveBeenCalledTimes(1); + + // Add a new file to the directory + vol.fromJSON( + { + "/testdir/file1.md": "# File 1", + "/testdir/file2.md": "# File 2\nNew file added", + }, + "/", + ); + + progressCallback.mockClear(); + + // Refresh the directory (directories don't use etag, they just re-scan) + const refreshOptions: ScraperOptions = { + url: "file:///testdir", + library: "test", + version: "1.0", + maxPages: 10, + maxDepth: 1, + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Should process both files + expect(progressCallback).toHaveBeenCalledTimes(2); + const calledUrls = progressCallback.mock.calls.map((call) => call[0].currentUrl); + expect(calledUrls).toContain("file:///testdir/file1.md"); + expect(calledUrls).toContain("file:///testdir/file2.md"); + }); + + it("should preserve depth from original scrape during refresh", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + const testContent = "# Deep File\nContent at depth 2"; + + vol.fromJSON( + { + "/deep/file.md": 
testContent, + }, + "/", + ); + + // First scrape to get etag + const initialOptions: ScraperOptions = { + url: "file:///deep/file.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 2, + }; + + await strategy.scrape(initialOptions, progressCallback); + const firstCall = progressCallback.mock.calls[0][0]; + const etag = firstCall.result?.etag; + + // Update the file with new content + vol.reset(); + vol.fromJSON( + { + "/deep/file.md": "# Deep File\nUpdated content", + }, + "/", + ); + + // Wait a bit to ensure different mtime + await new Promise((resolve) => setTimeout(resolve, 10)); + + progressCallback.mockClear(); + + // Refresh with original depth + const refreshOptions: ScraperOptions = { + url: "file:///deep/file.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 2, + initialQueue: [ + { + url: "file:///deep/file.md", + depth: 2, // Original depth preserved + pageId: 555, + etag: etag, + }, + ], + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Verify depth is preserved + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(1); + expect(docCalls[0][0].depth).toBe(2); + }); + }); }); diff --git a/src/scraper/strategies/LocalFileStrategy.ts b/src/scraper/strategies/LocalFileStrategy.ts index 0972b8c2..723a8cef 100644 --- a/src/scraper/strategies/LocalFileStrategy.ts +++ b/src/scraper/strategies/LocalFileStrategy.ts @@ -1,13 +1,12 @@ import fs from "node:fs/promises"; import path from "node:path"; -import type { Document, ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import { FileFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline } from "../pipelines/types"; -import type { ScraperOptions, ScraperProgress } from "../types"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem, ScraperOptions } from "../types"; +import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; /** * LocalFileStrategy handles crawling and scraping of local files and folders using file:// URLs. 
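Note on the refresh flow exercised by the tests above: they assume the file fetcher reports FetchStatus.NOT_MODIFIED when the etag supplied by the caller still matches the file on disk, and that the etag changes whenever the file's mtime changes. A minimal sketch of that comparison under those assumptions (the helper name and the exact hashing are illustrative, not the project's API):

```typescript
import crypto from "node:crypto";
import fs from "node:fs/promises";

// Illustrative only: decide whether a local file changed since the last scrape,
// assuming the stored etag is derived from the file's mtime (the hashing shown
// here is an assumption, not the project's implementation).
async function isLocalFileUnchanged(
  filePath: string,
  previousEtag?: string,
): Promise<boolean> {
  if (!previousEtag) return false; // no baseline etag -> treat as changed
  const stats = await fs.stat(filePath); // throws ENOENT if the file was deleted
  const currentEtag = crypto
    .createHash("md5")
    .update(stats.mtime.toISOString())
    .digest("hex");
  return currentEtag === previousEtag;
}
```

Under that assumption, an untouched file recomputes to the same etag and is skipped, a rewritten file yields a different etag and is re-processed, and a deleted file surfaces as ENOENT, which the strategy below maps to FetchStatus.NOT_FOUND.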
@@ -29,12 +28,11 @@ export class LocalFileStrategy extends BaseScraperStrategy { return url.startsWith("file://"); } - protected async processItem( + async processItem( item: QueueItem, options: ScraperOptions, - _progressCallback?: ProgressCallback, _signal?: AbortSignal, - ): Promise<{ document?: Document; links?: string[] }> { + ): Promise { // Parse the file URL properly to handle both file:// and file:/// formats let filePath = item.url.replace(/^file:\/\/\/?/, ""); filePath = decodeURIComponent(filePath); @@ -44,7 +42,21 @@ export class LocalFileStrategy extends BaseScraperStrategy { filePath = `/${filePath}`; } - const stats = await fs.stat(filePath); + let stats: Awaited>; + try { + stats = await fs.stat(filePath); + } catch (error) { + // File not found + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + logger.info(`✓ File deleted or not available: ${filePath}`); + return { + url: item.url, + links: [], + status: FetchStatus.NOT_FOUND, + }; + } + throw error; + } if (stats.isDirectory()) { const contents = await fs.readdir(filePath); @@ -52,17 +64,25 @@ export class LocalFileStrategy extends BaseScraperStrategy { const links = contents .map((name) => `file://${path.join(filePath, name)}`) .filter((url) => this.shouldProcessUrl(url, options)); - return { links }; + return { url: item.url, links, status: FetchStatus.SUCCESS }; } logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`); - const rawContent: RawContent = await this.fileFetcher.fetch(item.url); + const rawContent: RawContent = await this.fileFetcher.fetch(item.url, { + etag: item.etag, + }); - let processed: Awaited> | undefined; + // Handle NOT_MODIFIED status (file hasn't changed) + if (rawContent.status === FetchStatus.NOT_MODIFIED) { + logger.debug(`✓ File unchanged: ${filePath}`); + return { url: rawContent.source, links: [], status: FetchStatus.NOT_MODIFIED }; + } + + let processed: PipelineResult | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) { logger.debug( `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`, ); @@ -75,29 +95,28 @@ export class LocalFileStrategy extends BaseScraperStrategy { logger.warn( `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`, ); - return { document: undefined, links: [] }; + return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS }; } - for (const err of processed.errors) { + for (const err of processed.errors ?? []) { logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); } + // Use filename as fallback if title is empty or not a string + const filename = path.basename(filePath); + const title = processed.title?.trim() || filename || null; + + // For local files, we don't follow links (no crawling within file content) + // Return empty links array return { - document: { - content: typeof processed.textContent === "string" ? processed.textContent : "", - contentType: rawContent.mimeType, - metadata: { - url: rawContent.source, - title: - typeof processed.metadata.title === "string" - ? 
processed.metadata.title - : "Untitled", - library: options.library, - version: options.version, - etag: rawContent.etag, - lastModified: rawContent.lastModified, - }, - } satisfies Document, + url: rawContent.source, + title: title, + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, + links: [], + status: FetchStatus.SUCCESS, }; } diff --git a/src/scraper/strategies/NpmScraperStrategy.ts b/src/scraper/strategies/NpmScraperStrategy.ts index 336b74ab..ce0bd199 100644 --- a/src/scraper/strategies/NpmScraperStrategy.ts +++ b/src/scraper/strategies/NpmScraperStrategy.ts @@ -1,5 +1,5 @@ import type { ProgressCallback } from "../../types"; -import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; +import type { ScraperOptions, ScraperProgressEvent, ScraperStrategy } from "../types"; import { WebScraperStrategy } from "./WebScraperStrategy"; export class NpmScraperStrategy implements ScraperStrategy { @@ -23,7 +23,7 @@ export class NpmScraperStrategy implements ScraperStrategy { async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, ): Promise { // Use default strategy with our configuration, passing the signal diff --git a/src/scraper/strategies/PyPiScraperStrategy.ts b/src/scraper/strategies/PyPiScraperStrategy.ts index abe31f7b..50360654 100644 --- a/src/scraper/strategies/PyPiScraperStrategy.ts +++ b/src/scraper/strategies/PyPiScraperStrategy.ts @@ -1,5 +1,5 @@ import type { ProgressCallback } from "../../types"; -import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; +import type { ScraperOptions, ScraperProgressEvent, ScraperStrategy } from "../types"; import { WebScraperStrategy } from "./WebScraperStrategy"; export class PyPiScraperStrategy implements ScraperStrategy { @@ -23,7 +23,7 @@ export class PyPiScraperStrategy implements ScraperStrategy { async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, ): Promise { // Use default strategy with our configuration, passing the signal diff --git a/src/scraper/strategies/WebScraperStrategy.test.ts b/src/scraper/strategies/WebScraperStrategy.test.ts index e28f1d8b..8cc8cdaf 100644 --- a/src/scraper/strategies/WebScraperStrategy.test.ts +++ b/src/scraper/strategies/WebScraperStrategy.test.ts @@ -1,6 +1,7 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { Document } from "../../types"; -import type { ScraperOptions } from "../types"; +import type { ProgressCallback } from "../../types"; +import { FetchStatus } from "../fetcher/types"; +import type { ScrapeResult, ScraperOptions, ScraperProgressEvent } from "../types"; import { ScrapeMode } from "../types"; // Import ScrapeMode import { WebScraperStrategy } from "./WebScraperStrategy"; @@ -32,6 +33,7 @@ describe("WebScraperStrategy", () => { content: "

<html><body><h1>Default Mock Content</h1></body></html>
", mimeType: "text/html", source: "https://example.com", // Default source + status: FetchStatus.SUCCESS, }); // Create a fresh instance of the strategy for each test @@ -67,7 +69,7 @@ describe("WebScraperStrategy", () => { }, 10000); it("should use HttpFetcher to fetch content and process result", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com"; options.url = testUrl; // Ensure options match @@ -77,6 +79,7 @@ describe("WebScraperStrategy", () => { content: `${expectedTitle}

</title></head><body><h1>Fetched Content</h1></body></html>
`, mimeType: "text/html", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); @@ -90,17 +93,17 @@ describe("WebScraperStrategy", () => { // Verify that the pipeline processed and called the callback with a document expect(progressCallback).toHaveBeenCalled(); const documentProcessingCall = progressCallback.mock.calls.find( - (call) => call[0].document, + (call) => call[0].result, ); expect(documentProcessingCall).toBeDefined(); // Use non-null assertion operator (!) since we've asserted it's defined - expect(documentProcessingCall![0].document.content).toBe("# Fetched Content"); // Check processed markdown (from H1) - expect(documentProcessingCall![0].document.metadata.title).toBe(expectedTitle); // Check extracted title (from ) + expect(documentProcessingCall![0].result?.textContent).toBe("# Fetched Content"); // Check processed markdown (from H1) + expect(documentProcessingCall![0].result?.title).toBe(expectedTitle); // Check extracted title (from <title>) }, 10000); it("should respect the followRedirects option", async () => { options.followRedirects = false; - const progressCallback = vi.fn(); + const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>(); await strategy.scrape(options, progressCallback); @@ -112,7 +115,7 @@ describe("WebScraperStrategy", () => { // Also check that processing still happened expect(progressCallback).toHaveBeenCalled(); const documentProcessingCall = progressCallback.mock.calls.find( - (call) => call[0].document, + (call) => call[0].result, ); expect(documentProcessingCall).toBeDefined(); }, 10000); @@ -134,19 +137,25 @@ describe("WebScraperStrategy", () => { mockFetchFn.mockImplementation(async (url: string) => { if (url === "https://example.com") - return { content: baseHtml, mimeType: "text/html", source: url }; + return { + content: baseHtml, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; // Return simple content for subpages, title reflects URL return { content: `<html><head><title>${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.scope = "subpages"; options.maxDepth = 1; // Limit depth for simplicity options.maxPages = 5; // Allow enough pages - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); @@ -174,22 +183,17 @@ describe("WebScraperStrategy", () => { ); // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); // Type guard - + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(4); - expect(receivedDocs.some((doc) => doc.metadata.title === "Test Site")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "Test Site")).toBe(true); expect( - receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage1"), + receivedDocs.some((doc) => doc?.title === "https://example.com/subpage1"), ).toBe(true); expect( - receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage2/"), + receivedDocs.some((doc) => doc?.title === "https://example.com/subpage2/"), ).toBe(true); expect( - receivedDocs.some( - (doc) => doc.metadata.title === "https://example.com/relative-path", - ), + receivedDocs.some((doc) => doc?.title === "https://example.com/relative-path"), ).toBe(true); }, 10000); @@ -201,18 +205,20 @@ describe("WebScraperStrategy", 
() => { 'BaseSubAPIOther', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.scope = "hostname"; options.maxDepth = 1; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); @@ -229,14 +235,12 @@ describe("WebScraperStrategy", () => { expect(mockFetchFn).not.toHaveBeenCalledWith("https://other.com", expect.anything()); // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(2); - expect(receivedDocs.some((doc) => doc.metadata.title === "Base")).toBe(true); - expect( - receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage"), - ).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "Base")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "https://example.com/subpage")).toBe( + true, + ); }, 10000); it("should follow links based on scope=domain", async () => { @@ -247,18 +251,20 @@ describe("WebScraperStrategy", () => { 'BaseSubAPIOther', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.scope = "domain"; options.maxDepth = 1; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); @@ -275,17 +281,15 @@ describe("WebScraperStrategy", () => { expect(mockFetchFn).not.toHaveBeenCalledWith("https://other.com", expect.anything()); // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(3); - expect(receivedDocs.some((doc) => doc.metadata.title === "Base")).toBe(true); - expect( - receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage"), - ).toBe(true); - expect( - receivedDocs.some((doc) => doc.metadata.title === "https://api.example.com/ep"), - ).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "Base")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "https://example.com/subpage")).toBe( + true, + ); + expect(receivedDocs.some((doc) => doc?.title === "https://api.example.com/ep")).toBe( + true, + ); }, 10000); // --- Limit Tests --- @@ -300,6 +304,7 @@ describe("WebScraperStrategy", () => { 'L0L1', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } if (url === "https://example.com/level1") { @@ -309,6 +314,7 @@ describe("WebScraperStrategy", () => { 'L1L2', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } if (url === "https://example.com/level2") { @@ -318,6 +324,7 @@ describe("WebScraperStrategy", () => { 'L2L3', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } // Default for unexpected calls @@ -325,11 +332,12 @@ describe("WebScraperStrategy", () => { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.maxDepth = 1; // Limit depth - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await 
strategy.scrape(options, progressCallback); @@ -345,12 +353,10 @@ describe("WebScraperStrategy", () => { ); // Exceeds depth // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(2); // Base (L0) + L1 - expect(receivedDocs.some((doc) => doc.metadata.title === "L0")).toBe(true); - expect(receivedDocs.some((doc) => doc.metadata.title === "L1")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "L0")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "L1")).toBe(true); }, 10000); it("should respect maxPages option", async () => { @@ -362,17 +368,19 @@ describe("WebScraperStrategy", () => { 'Base123', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.maxPages = 2; // Limit pages - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); @@ -396,9 +404,7 @@ describe("WebScraperStrategy", () => { expect(subpagesFetchedCount).toBe(1); // Exactly one subpage fetched // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(2); // Base + 1 subpage }, 10000); @@ -413,23 +419,25 @@ describe("WebScraperStrategy", () => { 'Base12', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); options.maxPages = 3; // Allow all pages options.maxDepth = 1; await strategy.scrape(options, progressCallback); // Verify callback calls - const callsWithDocs = progressCallback.mock.calls.filter((call) => call[0].document); + const callsWithDocs = progressCallback.mock.calls.filter((call) => call[0].result); expect(callsWithDocs).toHaveLength(3); // Base + page1 + page2 // Check structure of a progress call with a document @@ -439,19 +447,15 @@ describe("WebScraperStrategy", () => { currentUrl: expect.any(String), depth: expect.any(Number), maxDepth: options.maxDepth, - document: expect.objectContaining({ - content: expect.any(String), - metadata: expect.objectContaining({ - url: expect.any(String), - title: expect.any(String), // Title comes from pipeline now - library: options.library, - version: options.version, - }), - }), - }); + result: expect.objectContaining({ + textContent: expect.any(String), + url: expect.any(String), + title: expect.any(String), + } satisfies Partial), + } satisfies Partial); // Check specific URLs reported - const reportedUrls = callsWithDocs.map((call) => call[0].document.metadata.url); + const reportedUrls = callsWithDocs.map((call) => call[0].result?.url); expect(reportedUrls).toEqual( expect.arrayContaining([ "https://example.com", @@ -477,9 +481,10 @@ describe("WebScraperStrategy", () => { content: `${expectedTitle}

</title></head><body><h1>Processed Content</h1></body></html>
`, mimeType: "text/html", source: urlWithCreds, + status: FetchStatus.SUCCESS, }); - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); // Ensure fetch was called with the credentialed URL @@ -488,14 +493,14 @@ describe("WebScraperStrategy", () => { expect.objectContaining({ followRedirects: true }), ); // Ensure a document was produced with the expected markdown and title - const docCall = progressCallback.mock.calls.find((call) => call[0].document); + const docCall = progressCallback.mock.calls.find((call) => call[0].result); expect(docCall).toBeDefined(); - expect(docCall![0].document.content).toContain(expectedMarkdown); - expect(docCall![0].document.metadata.title).toBe(expectedTitle); + expect(docCall![0].result?.textContent).toContain(expectedMarkdown); + expect(docCall![0].result?.title).toBe(expectedTitle); }, 10000); // Keep timeout for consistency but test should run quickly with fetch mode it("should forward custom headers to HttpFetcher", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com"; options.url = testUrl; options.headers = { @@ -506,6 +511,7 @@ describe("WebScraperStrategy", () => { content: "Header Test", mimeType: "text/html", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith( @@ -521,7 +527,7 @@ describe("WebScraperStrategy", () => { describe("pipeline selection", () => { it("should process HTML content through HtmlPipeline", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com"; options.url = testUrl; @@ -530,19 +536,20 @@ describe("WebScraperStrategy", () => { "HTML Test

</title></head><body><h1>HTML Content</h1></body></html>
", mimeType: "text/html", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); // Verify HTML content was processed (converted to markdown) - const docCall = progressCallback.mock.calls.find((call) => call[0].document); + const docCall = progressCallback.mock.calls.find((call) => call[0].result); expect(docCall).toBeDefined(); - expect(docCall![0].document.content).toContain("# HTML Content"); - expect(docCall![0].document.metadata.title).toBe("HTML Test"); + expect(docCall![0].result?.textContent).toContain("# HTML Content"); + expect(docCall![0].result?.title).toBe("HTML Test"); }); it("should process markdown content through MarkdownPipeline", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com/readme.md"; options.url = testUrl; @@ -551,19 +558,22 @@ describe("WebScraperStrategy", () => { content: markdownContent, mimeType: "text/markdown", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); // Verify markdown content was processed - const docCall = progressCallback.mock.calls.find((call) => call[0].document); + const docCall = progressCallback.mock.calls.find((call) => call[0].result); expect(docCall).toBeDefined(); - expect(docCall![0].document.content).toContain("# Markdown Title"); - expect(docCall![0].document.content).toContain("This is already markdown content."); + expect(docCall![0].result?.textContent).toContain("# Markdown Title"); + expect(docCall![0].result?.textContent).toContain( + "This is already markdown content.", + ); }); it("should skip unsupported content types", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com/image.png"; options.url = testUrl; @@ -571,19 +581,20 @@ describe("WebScraperStrategy", () => { content: Buffer.from([0x89, 0x50, 0x4e, 0x47]), // PNG header mimeType: "image/png", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); // Verify no document was produced for unsupported content - const docCall = progressCallback.mock.calls.find((call) => call[0].document); + const docCall = progressCallback.mock.calls.find((call) => call[0].result); expect(docCall).toBeUndefined(); }); }); describe("error handling", () => { it("should handle fetch failures gracefully", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com/error"; options.url = testUrl; @@ -595,12 +606,12 @@ describe("WebScraperStrategy", () => { ); // Verify no documents were processed - const docCalls = progressCallback.mock.calls.filter((call) => call[0].document); + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); expect(docCalls).toHaveLength(0); }); it("should handle empty content gracefully", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com/empty"; options.url = testUrl; @@ -608,6 +619,7 @@ describe("WebScraperStrategy", () => { content: "", // Empty content mimeType: "text/html", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); @@ -640,17 +652,19 @@ describe("WebScraperStrategy", () => { `, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); 
options.maxDepth = 1; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await customStrategy.scrape(options, progressCallback); @@ -673,9 +687,7 @@ describe("WebScraperStrategy", () => { ); // Verify documents were produced for allowed pages - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(3); // Base + 2 allowed pages }); }); @@ -695,12 +707,14 @@ describe("WebScraperStrategy", () => { content: `Link`, mimeType: "text/html", source: canonical, // Final URL after redirect + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -708,7 +722,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 5; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith(original, expect.anything()); @@ -732,12 +746,14 @@ describe("WebScraperStrategy", () => { `, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -746,7 +762,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 5; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith(start, expect.anything()); @@ -771,12 +787,14 @@ describe("WebScraperStrategy", () => { `, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -785,7 +803,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 10; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith(start, expect.anything()); @@ -805,12 +823,14 @@ describe("WebScraperStrategy", () => { content: `Nested`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -819,7 +839,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 5; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith(startDir, expect.anything()); @@ -838,6 +858,7 @@ describe("WebScraperStrategy", () => { content: `Script`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } // Any unexpected fetches return generic content @@ -845,6 +866,7 @@ describe("WebScraperStrategy", () => { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -853,7 +875,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 5; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); // Should fetch only the start page; the cross-origin (different hostname) base-derived link is filtered out @@ -866,14 +888,16 @@ describe("WebScraperStrategy", () => { const strategy = 
new WebScraperStrategy(); // Spy on the close method of all pipelines - (strategy as any).pipelines.forEach((pipeline: any) => { + // @ts-expect-error - pipelines is private, but we need to access it for testing + strategy.pipelines.forEach((pipeline: any) => { vi.spyOn(pipeline, "close"); }); await strategy.cleanup(); // Verify close was called on all pipelines - (strategy as any).pipelines.forEach((pipeline: any) => { + // @ts-expect-error - pipelines is private, but we need to access it for testing + strategy.pipelines.forEach((pipeline: any) => { expect(pipeline.close).toHaveBeenCalledOnce(); }); }); @@ -882,7 +906,8 @@ describe("WebScraperStrategy", () => { const strategy = new WebScraperStrategy(); // Mock one pipeline to throw an error during cleanup - vi.spyOn((strategy as any).pipelines[0], "close").mockRejectedValue( + // @ts-expect-error - pipelines is private, but we need to access it for testing + vi.spyOn(strategy.pipelines[0], "close").mockRejectedValue( new Error("Pipeline cleanup failed"), ); @@ -898,4 +923,296 @@ describe("WebScraperStrategy", () => { await expect(strategy.cleanup()).resolves.not.toThrow(); }); }); + + describe("refresh workflow", () => { + beforeEach(() => { + vi.resetAllMocks(); + mockFetchFn.mockResolvedValue({ + content: "

<html><body><h1>Default Mock Content</h1></body></html>
", + mimeType: "text/html", + source: "https://example.com", + status: FetchStatus.SUCCESS, + }); + strategy = new WebScraperStrategy(); + options = { + url: "https://example.com", + library: "test", + version: "1.0", + maxPages: 99, + maxDepth: 3, + scope: "subpages", + followRedirects: true, + scrapeMode: ScrapeMode.Fetch, + }; + }); + + it("should skip processing when page returns 304 Not Modified", async () => { + const progressCallback = vi.fn>(); + + // Configure mock to return 304 for a refresh operation + mockFetchFn.mockResolvedValue({ + content: "", + mimeType: "text/html", + source: "https://example.com/page1", + status: FetchStatus.NOT_MODIFIED, + }); + + // Create a queue item with pageId and etag (refresh operation) + options.initialQueue = [ + { + url: "https://example.com/page1", + depth: 0, + pageId: 123, + etag: "existing-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify fetch was called with etag + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/page1", + expect.objectContaining({ + etag: "existing-etag", + }), + ); + + // Verify no documents were processed (304 means unchanged) + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(0); + }); + + it("should report deleted flag when page returns 404 Not Found during refresh", async () => { + const progressCallback = vi.fn>(); + + // Configure mock to return 404 + mockFetchFn.mockResolvedValue({ + content: "", + mimeType: "text/html", + source: "https://example.com/deleted-page", + status: FetchStatus.NOT_FOUND, + }); + + // Create a queue item with pageId and etag (refresh operation) + options.initialQueue = [ + { + url: "https://example.com/deleted-page", + depth: 0, + pageId: 456, + etag: "old-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify fetch was called + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/deleted-page", + expect.objectContaining({ + etag: "old-etag", + }), + ); + + // Verify no processed documents were returned + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(0); + }); + + it("should refresh page content when page returns 200 OK", async () => { + const progressCallback = vi.fn>(); + const updatedContent = + "Updated

</title></head><body><h1>New Content</h1></body></html>
"; + + // Configure mock to return 200 with new content + mockFetchFn.mockResolvedValue({ + content: updatedContent, + mimeType: "text/html", + source: "https://example.com/updated-page", + status: FetchStatus.SUCCESS, + etag: "new-etag", + }); + + // Create a queue item with pageId and etag (refresh operation) + options.initialQueue = [ + { + url: "https://example.com/updated-page", + depth: 0, + pageId: 789, + etag: "old-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify fetch was called with old etag + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/updated-page", + expect.objectContaining({ + etag: "old-etag", + }), + ); + + // Verify new content was processed + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(1); + expect(docCalls[0][0].result?.textContent).toContain("# New Content"); + expect(docCalls[0][0].result?.title).toBe("Updated"); + expect(docCalls[0][0].result?.etag).toBe("new-etag"); + }); + + it("should not follow links during refresh operations", async () => { + const progressCallback = vi.fn>(); + const contentWithLinks = ` + + Refreshed Page + +

<h1>Content</h1>
+ New Link + Another New Link + + + `; + + // Configure mock to return 200 with new links + mockFetchFn.mockResolvedValue({ + content: contentWithLinks, + mimeType: "text/html", + source: "https://example.com/page-with-links", + status: FetchStatus.SUCCESS, + etag: "new-etag", + }); + + // Create a queue item with pageId and etag (refresh operation) + options.initialQueue = [ + { + url: "https://example.com/page-with-links", + depth: 0, + pageId: 999, + etag: "old-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify only the initial page was fetched (no link following) + expect(mockFetchFn).toHaveBeenCalledTimes(1); + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/page-with-links", + expect.anything(), + ); + + // Verify the new links were not followed + expect(mockFetchFn).not.toHaveBeenCalledWith( + "https://example.com/new-link", + expect.anything(), + ); + expect(mockFetchFn).not.toHaveBeenCalledWith( + "https://example.com/another-new-link", + expect.anything(), + ); + }); + + it("should process multiple pages in a refresh operation with mixed statuses", async () => { + const progressCallback = vi.fn>(); + + // Configure mock to return different statuses for different URLs + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com/unchanged") { + return { + content: "", + mimeType: "text/html", + source: url, + status: FetchStatus.NOT_MODIFIED, + }; + } + if (url === "https://example.com/deleted") { + return { + content: "", + mimeType: "text/html", + source: url, + status: FetchStatus.NOT_FOUND, + }; + } + if (url === "https://example.com/updated") { + return { + content: + "Updated

</title></head><body><h1>New</h1></body></html>
", + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + etag: "new-etag", + }; + } + return { + content: "Default", + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + }); + + // Create a queue with multiple pages + options.initialQueue = [ + { + url: "https://example.com/unchanged", + depth: 0, + pageId: 1, + etag: "etag-1", + }, + { + url: "https://example.com/deleted", + depth: 0, + pageId: 2, + etag: "etag-2", + }, + { + url: "https://example.com/updated", + depth: 0, + pageId: 3, + etag: "etag-3", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify all three pages were fetched + expect(mockFetchFn).toHaveBeenCalledTimes(3); + + // Verify only the updated page produced a processed document + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(1); + expect(docCalls[0][0].result?.url).toBe("https://example.com/updated"); + expect(docCalls[0][0].result?.title).toBe("Updated"); + }); + + it("should preserve depth from original scrape during refresh", async () => { + const progressCallback = vi.fn>(); + + mockFetchFn.mockResolvedValue({ + content: + "Depth Test

</title></head><body><h1>Content</h1></body></html>

", + mimeType: "text/html", + source: "https://example.com/deep-page", + status: FetchStatus.SUCCESS, + etag: "new-etag", + }); + + // Create a queue item with depth from original scrape + options.initialQueue = [ + { + url: "https://example.com/deep-page", + depth: 2, // This page was originally scraped at depth 2 + pageId: 555, + etag: "old-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify the processed document preserves the original depth + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(1); + expect(docCalls[0][0].depth).toBe(2); + }); + }); }); diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index 274ae17a..ce1af2ae 100644 --- a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -1,13 +1,12 @@ -import type { Document, ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import type { UrlNormalizerOptions } from "../../utils/url"; import { AutoDetectFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline, ProcessedContent } from "../pipelines/types"; -import type { ScraperOptions, ScraperProgress } from "../types"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem, ScraperOptions } from "../types"; import { isInScope } from "../utils/scope"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; export interface WebScraperStrategyOptions { urlNormalizerOptions?: UrlNormalizerOptions; @@ -47,26 +46,51 @@ export class WebScraperStrategy extends BaseScraperStrategy { protected override async processItem( item: QueueItem, options: ScraperOptions, - _progressCallback?: ProgressCallback, // Base class passes it, but not used here - signal?: AbortSignal, // Add signal - ): Promise<{ document?: Document; links?: string[]; finalUrl?: string }> { + signal?: AbortSignal, + ): Promise { const { url } = item; try { - // Define fetch options, passing signal, followRedirects, and headers + // Check if this is a refresh operation (has pageId and etag) + const isRefresh = item.pageId !== undefined && item.etag !== undefined; + + // Define fetch options, passing signal, followRedirects, headers, and etag const fetchOptions = { signal, followRedirects: options.followRedirects, headers: options.headers, // Forward custom headers + etag: item.etag, // Pass ETag for conditional requests }; // Use AutoDetectFetcher which handles fallbacks automatically const rawContent: RawContent = await this.fetcher.fetch(url, fetchOptions); + // Handle NOT_MODIFIED status (HTTP 304) + if (rawContent.status === FetchStatus.NOT_MODIFIED) { + if (isRefresh) { + logger.debug(`✓ Page unchanged (304): ${url}`); + // Return empty result, no processing needed + return { url, links: [], status: FetchStatus.NOT_MODIFIED }; + } + // For non-refresh operations, 304 shouldn't happen + logger.warn(`⚠️ Unexpected 304 response for non-refresh operation: ${url}`); + return { url, links: [], status: FetchStatus.NOT_MODIFIED }; + } + + // Handle SUCCESS status (HTTP 200) + // For refresh operations with existing pages, mark for deletion before re-adding + const 
shouldRefresh = isRefresh && item.pageId; + if (shouldRefresh) { + logger.debug(`✓ Refreshing page content: ${url}`); + } + // --- Start Pipeline Processing --- - let processed: ProcessedContent | undefined; + let processed: PipelineResult | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + const contentBuffer = Buffer.isBuffer(rawContent.content) + ? rawContent.content + : Buffer.from(rawContent.content); + if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) { logger.debug( `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`, ); @@ -79,11 +103,11 @@ export class WebScraperStrategy extends BaseScraperStrategy { logger.warn( `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`, ); - return { document: undefined, links: [] }; + return { url, links: [], status: FetchStatus.SUCCESS }; } // Log errors from pipeline - for (const err of processed.errors) { + for (const err of processed.errors ?? []) { logger.warn(`⚠️ Processing error for ${url}: ${err.message}`); } @@ -92,48 +116,48 @@ export class WebScraperStrategy extends BaseScraperStrategy { logger.warn( `⚠️ No processable content found for ${url} after pipeline execution.`, ); - return { document: undefined, links: processed.links }; + return { + url, + links: processed.links, + status: FetchStatus.SUCCESS, + }; } - // Determine base for scope filtering: - // For depth 0 (initial page) use the final fetched URL (rawContent.source) so protocol/host redirects don't drop links. - // For deeper pages, use canonicalBaseUrl (set after first page) or fallback to original. - const baseUrl = - item.depth === 0 - ? new URL(rawContent.source) - : (this.canonicalBaseUrl ?? new URL(options.url)); - - const filteredLinks = processed.links.filter((link) => { - try { - const targetUrl = new URL(link); - const scope = options.scope || "subpages"; - return ( - isInScope(baseUrl, targetUrl, scope) && - (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl)) - ); - } catch { - return false; - } - }); + // For refresh operations, don't extract or follow links + let filteredLinks: string[] = []; + + if (!isRefresh) { + // Determine base for scope filtering: + // For depth 0 (initial page) use the final fetched URL (rawContent.source) so protocol/host redirects don't drop links. + // For deeper pages, use canonicalBaseUrl (set after first page) or fallback to original. + const baseUrl = + item.depth === 0 + ? new URL(rawContent.source) + : (this.canonicalBaseUrl ?? new URL(options.url)); + + filteredLinks = + processed.links?.filter((link) => { + try { + const targetUrl = new URL(link); + const scope = options.scope || "subpages"; + return ( + isInScope(baseUrl, targetUrl, scope) && + (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl)) + ); + } catch { + return false; + } + }) ?? []; + } return { - document: { - content: processed.textContent, - metadata: { - url, - title: - typeof processed.metadata.title === "string" - ? 
processed.metadata.title - : "Untitled", - library: options.library, - version: options.version, - etag: rawContent.etag, - lastModified: rawContent.lastModified, - ...processed.metadata, - }, - } satisfies Document, + url, + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, links: filteredLinks, - finalUrl: rawContent.source, + status: FetchStatus.SUCCESS, }; } catch (error) { // Log fetch errors or pipeline execution errors (if run throws) diff --git a/src/scraper/types.ts b/src/scraper/types.ts index 05b670d8..d392f4c2 100644 --- a/src/scraper/types.ts +++ b/src/scraper/types.ts @@ -1,4 +1,15 @@ -import type { Document, ProgressCallback } from "../types"; +import type { Chunk } from "../splitter/types"; +import type { ProgressCallback } from "../types"; + +/** + * Represents an item in the scraping queue + */ +export type QueueItem = { + url: string; + depth: number; + pageId?: number; // Database page ID for efficient deletion during refresh + etag?: string | null; // Last known ETag for conditional requests during refresh +}; /** * Enum defining the available HTML processing strategies. @@ -16,7 +27,7 @@ export interface ScraperStrategy { canHandle(url: string): boolean; scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, // Add optional signal ): Promise; @@ -28,7 +39,16 @@ export interface ScraperStrategy { } /** - * Options for configuring the scraping process + * Internal runtime options for configuring the scraping process. + * + * This is the comprehensive configuration object used by ScraperService, PipelineWorker, + * and scraper strategies. It includes both: + * - User-facing options (provided via tools like scrape_docs) + * - System-managed options (set internally by PipelineManager) + * + * Note: User-facing tools should NOT expose all these options directly. Instead, + * PipelineManager is responsible for translating user input into this complete + * runtime configuration. */ export interface ScraperOptions { url: string; @@ -76,28 +96,75 @@ export interface ScraperOptions { * Keys are header names, values are header values. */ headers?: Record; + /** + * Pre-populated queue of pages to visit. + * When provided: + * - Disables link discovery and crawling + * - Processes only the provided URLs + * - Uses provided metadata (pageId, etag) for optimization + */ + initialQueue?: QueueItem[]; + /** + * Indicates whether this is a refresh operation (re-indexing existing version). + * When true: + * - Skips initial removeAllDocuments call to preserve existing data + * - Uses ETags for conditional requests + * - Only updates changed/deleted pages + * @default false + */ + isRefresh?: boolean; } /** - * Result of scraping a single page. Used internally by HtmlScraper. + * Result of scraping a single page. */ -export interface ScrapedPage { - content: string; - title: string; +export interface ScrapeResult { + /** The URL of the page that was scraped */ url: string; - /** URLs extracted from page links, used for recursive scraping */ + /** Page title */ + title: string; + /** MIME type of the content being processed */ + contentType: string; + /** The final processed content, typically as a string (e.g., Markdown). Used primarily for debugging */ + textContent: string; + /** Extracted links from the content. */ links: string[]; + /** Any non-critical errors encountered during processing. 
*/ + errors: Error[]; + /** Pre-split chunks from pipeline processing */ + chunks: Chunk[]; + /** ETag from HTTP response for caching */ + etag?: string | null; + /** Last-Modified from HTTP response for caching */ + lastModified?: string | null; } /** * Progress information during scraping */ -export interface ScraperProgress { +export interface ScraperProgressEvent { + /** Number of pages successfully scraped so far */ pagesScraped: number; - totalPages: number; // Effective total pages (limited by maxPages configuration) - totalDiscovered: number; // Actual number of pages discovered (may exceed totalPages) + /** + * Maximum number of pages to scrape (from maxPages option). + * May be undefined if no limit is set. + */ + totalPages: number; + /** + * Total number of URLs discovered during crawling. + * This may be higher than totalPages if maxPages limit is reached. + */ + totalDiscovered: number; + /** Current URL being processed */ currentUrl: string; + /** Current depth in the crawl tree */ depth: number; + /** Maximum depth allowed (from maxDepth option) */ maxDepth: number; - document?: Document; + /** The result of scraping the current page, if available. This may be null if the page has been deleted or if an error occurred. */ + result: ScrapeResult | null; + /** Database page ID (for refresh operations or tracking) */ + pageId?: number; + /** Indicates this page was deleted (404 during refresh or broken link) */ + deleted?: boolean; } diff --git a/src/splitter/GreedySplitter.test.ts b/src/splitter/GreedySplitter.test.ts index 9a604190..8ef7e19c 100644 --- a/src/splitter/GreedySplitter.test.ts +++ b/src/splitter/GreedySplitter.test.ts @@ -1,12 +1,12 @@ import { describe, expect, it, vi } from "vitest"; import { GreedySplitter } from "./GreedySplitter"; import { SemanticMarkdownSplitter } from "./SemanticMarkdownSplitter"; -import type { ContentChunk } from "./types"; +import type { Chunk } from "./types"; vi.mock("../utils/logger"); // Mock SemanticMarkdownSplitter -const createMockSemanticSplitter = (chunks: ContentChunk[]) => { +const createMockSemanticSplitter = (chunks: Chunk[]) => { const mockSplitText = vi.fn().mockResolvedValue(chunks); const mockSemanticSplitter = { splitText: mockSplitText, @@ -23,7 +23,7 @@ describe("GreedySplitter", () => { }); it("should return the original chunk if it's within min and max size", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "This is a single chunk.", @@ -37,7 +37,7 @@ describe("GreedySplitter", () => { }); it("should concatenate chunks until minChunkSize is reached", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Short text 1.", @@ -62,7 +62,7 @@ describe("GreedySplitter", () => { }); it("should respect H1/H2 boundaries", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Text before heading.", @@ -102,7 +102,7 @@ describe("GreedySplitter", () => { }); it("should not exceed preferredChunkSize", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "This is a long text chunk. 
", @@ -132,7 +132,7 @@ describe("GreedySplitter", () => { }); it("should preserve section metadata when concatenating chunks with identical sections", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Short text 1.", @@ -157,7 +157,7 @@ describe("GreedySplitter", () => { }); it("should merge heading with its content when minChunkSize > 0", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["heading"], content: "# Section 1", @@ -182,7 +182,7 @@ describe("GreedySplitter", () => { }); it("should keep heading separate when minChunkSize = 0", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["heading"], content: "# Section 1", @@ -201,7 +201,7 @@ describe("GreedySplitter", () => { }); it("should use deeper path when merging parent with child section", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Parent content", @@ -232,7 +232,7 @@ describe("GreedySplitter", () => { }); it("should use common parent when merging sibling sections", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "First subsection", @@ -266,7 +266,7 @@ describe("GreedySplitter", () => { }); it("should use root when merging sections with no common path", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "First section", @@ -300,7 +300,7 @@ describe("GreedySplitter", () => { }); it("should handle deeply nested sections", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Level 1", @@ -333,7 +333,7 @@ describe("GreedySplitter", () => { }); it("should handle deep sibling sections with common parent", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ // Deep sibling sections under Section 1 -> SubSection 1.1 { types: ["text"], diff --git a/src/splitter/GreedySplitter.ts b/src/splitter/GreedySplitter.ts index ba068f99..b1181bd1 100644 --- a/src/splitter/GreedySplitter.ts +++ b/src/splitter/GreedySplitter.ts @@ -1,4 +1,4 @@ -import type { ContentChunk, DocumentSplitter, SectionContentType } from "./types"; +import type { Chunk, DocumentSplitter, SectionContentType } from "./types"; /** * Takes small document chunks and greedily concatenates them into larger, more meaningful units @@ -36,10 +36,10 @@ export class GreedySplitter implements DocumentSplitter { * section boundaries to maintain document structure. This balances the need for * context with semantic coherence. 
*/ - async splitText(markdown: string, contentType?: string): Promise { + async splitText(markdown: string, contentType?: string): Promise { const initialChunks = await this.baseSplitter.splitText(markdown, contentType); - const concatenatedChunks: ContentChunk[] = []; - let currentChunk: ContentChunk | null = null; + const concatenatedChunks: Chunk[] = []; + let currentChunk: Chunk | null = null; for (const nextChunk of initialChunks) { if (currentChunk) { @@ -71,7 +71,7 @@ export class GreedySplitter implements DocumentSplitter { return concatenatedChunks; } - private cloneChunk(chunk: ContentChunk): ContentChunk { + private cloneChunk(chunk: Chunk): Chunk { return { types: [...chunk.types], content: chunk.content, @@ -86,7 +86,7 @@ export class GreedySplitter implements DocumentSplitter { * H1 and H2 headings represent major conceptual breaks in the document. * Preserving these splits helps maintain the document's logical structure. */ - private startsNewMajorSection(chunk: ContentChunk): boolean { + private startsNewMajorSection(chunk: Chunk): boolean { return chunk.section.level === 1 || chunk.section.level === 2; } @@ -94,10 +94,7 @@ export class GreedySplitter implements DocumentSplitter { * Size limit check to ensure chunks remain within embedding model constraints. * Essential for maintaining consistent embedding quality and avoiding truncation. */ - private wouldExceedMaxSize( - currentChunk: ContentChunk | null, - nextChunk: ContentChunk, - ): boolean { + private wouldExceedMaxSize(currentChunk: Chunk | null, nextChunk: Chunk): boolean { if (!currentChunk) { return false; } @@ -122,10 +119,7 @@ export class GreedySplitter implements DocumentSplitter { * - For siblings/unrelated sections, uses the common parent path * - If no common path exists, uses the root path ([]) */ - private mergeSectionInfo( - currentChunk: ContentChunk, - nextChunk: ContentChunk, - ): ContentChunk["section"] { + private mergeSectionInfo(currentChunk: Chunk, nextChunk: Chunk): Chunk["section"] { // Always use the lowest level const level = Math.min(currentChunk.section.level, nextChunk.section.level); diff --git a/src/splitter/JsonDocumentSplitter.ts b/src/splitter/JsonDocumentSplitter.ts index 6bbc21fe..c06d10b7 100644 --- a/src/splitter/JsonDocumentSplitter.ts +++ b/src/splitter/JsonDocumentSplitter.ts @@ -13,7 +13,7 @@ * 5. Let GreedySplitter handle size optimization */ -import type { ContentChunk, DocumentSplitter } from "./types"; +import type { Chunk, DocumentSplitter } from "./types"; type JsonValue = | string @@ -35,10 +35,10 @@ export class JsonDocumentSplitter implements DocumentSplitter { this.preserveFormatting = options.preserveFormatting ?? 
true; } - async splitText(content: string, _contentType?: string): Promise { + async splitText(content: string, _contentType?: string): Promise { try { const parsed: JsonValue = JSON.parse(content); - const chunks: ContentChunk[] = []; + const chunks: Chunk[] = []; // Process the JSON structure recursively, starting with root path this.processValue(parsed, ["root"], 1, 0, chunks, true); @@ -64,7 +64,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastItem: boolean, ): void { if (Array.isArray(value)) { @@ -81,7 +81,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastItem: boolean, ): void { const indent = this.getIndent(indentLevel); @@ -114,7 +114,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastItem: boolean, ): void { const indent = this.getIndent(indentLevel); @@ -157,7 +157,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastProperty: boolean, ): void { const indent = this.getIndent(indentLevel); @@ -189,7 +189,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastItem: boolean, ): void { const indent = this.getIndent(indentLevel); diff --git a/src/splitter/SemanticMarkdownSplitter.ts b/src/splitter/SemanticMarkdownSplitter.ts index ddeb9de7..c53e85bb 100644 --- a/src/splitter/SemanticMarkdownSplitter.ts +++ b/src/splitter/SemanticMarkdownSplitter.ts @@ -11,7 +11,7 @@ import { ContentSplitterError, MinimumChunkSizeError } from "./errors"; import { CodeContentSplitter } from "./splitters/CodeContentSplitter"; import { TableContentSplitter } from "./splitters/TableContentSplitter"; import { TextContentSplitter } from "./splitters/TextContentSplitter"; -import type { ContentChunk, DocumentSplitter, SectionContentType } from "./types"; +import type { Chunk, DocumentSplitter, SectionContentType } from "./types"; /** * Represents a section of content within a document, @@ -101,7 +101,7 @@ export class SemanticMarkdownSplitter implements DocumentSplitter { /** * Main entry point for splitting markdown content */ - async splitText(markdown: string, _contentType?: string): Promise { + async splitText(markdown: string, _contentType?: string): Promise { // Note: JSON content is now handled by dedicated JsonDocumentSplitter in JsonPipeline // This splitter focuses on markdown, HTML, and plain text content @@ -219,10 +219,8 @@ export class SemanticMarkdownSplitter implements DocumentSplitter { /** * Step 2: Split section content into smaller chunks */ - private async splitSectionContent( - sections: DocumentSection[], - ): Promise { - const chunks: ContentChunk[] = []; + private async splitSectionContent(sections: DocumentSection[]): Promise { + const chunks: Chunk[] = []; for (const section of sections) { for (const content of section.content) { @@ -296,7 +294,7 @@ export class SemanticMarkdownSplitter implements DocumentSplitter { // Create chunks from split content chunks.push( ...splitContent.map( - (text): ContentChunk => ({ + (text): Chunk => ({ types: [content.type], content: text, section: { diff 
--git a/src/splitter/TextDocumentSplitter.ts b/src/splitter/TextDocumentSplitter.ts index 42748bca..f64b45ce 100644 --- a/src/splitter/TextDocumentSplitter.ts +++ b/src/splitter/TextDocumentSplitter.ts @@ -9,7 +9,7 @@ import { SPLITTER_MAX_CHUNK_SIZE } from "../utils"; import { TextContentSplitter } from "./splitters/TextContentSplitter"; -import type { ContentChunk, DocumentSplitter } from "./types"; +import type { Chunk, DocumentSplitter } from "./types"; /** * Configuration options for text document splitting @@ -39,7 +39,7 @@ export class TextDocumentSplitter implements DocumentSplitter { }); } - async splitText(content: string): Promise { + async splitText(content: string): Promise { if (!content.trim()) { return []; } diff --git a/src/splitter/treesitter/TreesitterSourceCodeSplitter.ts b/src/splitter/treesitter/TreesitterSourceCodeSplitter.ts index 52b38d4c..1d6a7b92 100644 --- a/src/splitter/treesitter/TreesitterSourceCodeSplitter.ts +++ b/src/splitter/treesitter/TreesitterSourceCodeSplitter.ts @@ -8,7 +8,7 @@ import { SPLITTER_MAX_CHUNK_SIZE } from "../../utils"; import { TextContentSplitter } from "../splitters/TextContentSplitter"; -import type { ContentChunk, DocumentSplitter } from "../types"; +import type { Chunk, DocumentSplitter } from "../types"; import { LanguageParserRegistry } from "./LanguageParserRegistry"; import type { CodeBoundary, LanguageParser } from "./parsers/types"; @@ -41,7 +41,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { }); } - async splitText(content: string, contentType?: string): Promise { + async splitText(content: string, contentType?: string): Promise { if (!content.trim()) { return []; } @@ -89,7 +89,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { /** * Helper method to fall back to TextContentSplitter and convert results to ContentChunk[] */ - private async fallbackToTextSplitter(content: string): Promise { + private async fallbackToTextSplitter(content: string): Promise { const textChunks = await this.textContentSplitter.split(content); return textChunks.map((chunk) => ({ types: ["code"], @@ -173,7 +173,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { content: string, path: string[], level: number, - ): Promise { + ): Promise { // Preserve whitespace-only content if it fits within chunk size (for perfect reconstruction) // Only skip if content is completely empty if (content.length === 0) { @@ -223,7 +223,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { boundaries: CodeBoundary[], content: string, _contentType?: string, - ): Promise { + ): Promise { const lines = content.split("\n"); const totalLines = lines.length; @@ -299,7 +299,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { } // Step 4: Convert segments directly to chunks (whitespace retained verbatim) - const chunks: ContentChunk[] = []; + const chunks: Chunk[] = []; // Ensure only ONE structural chunk is emitted per structural boundary. 
const structuralBoundaryFirstChunk = new Set(); diff --git a/src/splitter/types.ts b/src/splitter/types.ts index 40086eb7..0d48ccc2 100644 --- a/src/splitter/types.ts +++ b/src/splitter/types.ts @@ -6,7 +6,7 @@ export type SectionContentType = "text" | "code" | "table" | "heading" | "struct /** * Final output chunk after processing and size-based splitting */ -export interface ContentChunk { +export interface Chunk { types: SectionContentType[]; content: string; section: { @@ -19,5 +19,5 @@ export interface ContentChunk { * Interface for a splitter that processes markdown content into chunks */ export interface DocumentSplitter { - splitText(markdown: string, contentType?: string): Promise; + splitText(markdown: string, contentType?: string): Promise; } diff --git a/src/store/DocumentManagementService.test.ts b/src/store/DocumentManagementService.test.ts index 343b32d7..6095bd18 100644 --- a/src/store/DocumentManagementService.test.ts +++ b/src/store/DocumentManagementService.test.ts @@ -1,12 +1,7 @@ import path from "node:path"; -import { Document } from "@langchain/core/documents"; import { createFsFromVolume, vol } from "memfs"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { - LibraryNotFoundInStoreError, - StoreError, - VersionNotFoundInStoreError, -} from "./errors"; +import { LibraryNotFoundInStoreError, VersionNotFoundInStoreError } from "./errors"; vi.mock("node:fs", () => ({ default: createFsFromVolume(vol), @@ -270,108 +265,6 @@ describe("DocumentManagementService", () => { expect(mockStore.checkDocumentExists).toHaveBeenCalledWith("test-lib", "1.0.0"); }); - describe("document processing", () => { - it("should add and search documents with basic metadata", async () => { - const library = "test-lib"; - const version = "1.0.0"; - const validDocument = new Document({ - pageContent: "Test document content about testing", - metadata: { - url: "http://example.com", - title: "Test Doc", - }, - }); - - const documentNoUrl = new Document({ - pageContent: "Test document without URL", - metadata: { - title: "Test Doc", - }, - }); - - // Should fail when URL is missing - await expect( - docService.addDocument(library, version, documentNoUrl), - ).rejects.toThrow(StoreError); - - await expect( - docService.addDocument(library, version, documentNoUrl), - ).rejects.toHaveProperty("message", "Document metadata must include a valid URL"); - - // Should succeed with valid URL - mockRetriever.search.mockResolvedValue(["Mocked search result"]); - - await docService.addDocument(library, version, validDocument); - - const results = await docService.searchStore(library, version, "testing"); - expect(mockStore.addDocuments).toHaveBeenCalledWith( - // Fix: Use mockStoreInstance - library, - version, - expect.arrayContaining([ - expect.objectContaining({ pageContent: validDocument.pageContent }), - ]), - ); - expect(results).toEqual(["Mocked search result"]); // Expect mocked result - }); - - it("should preserve semantic metadata when processing markdown documents", async () => { - const library = "test-lib"; - const version = "1.0.0"; - const document = new Document({ - pageContent: "# Chapter 1\nTest content\n## Section 1.1\nMore testing content", - metadata: { - url: "http://example.com/docs", - title: "Root Doc", - }, - }); - - // Mock the search result to match what would actually be stored after processing - mockRetriever.search.mockResolvedValue(["Mocked search result"]); - - await docService.addDocument(library, version, document); - - // Verify the documents 
were stored with semantic metadata - expect(mockStore.addDocuments).toHaveBeenCalledWith( - library, - version, - expect.arrayContaining([ - expect.objectContaining({ - metadata: expect.objectContaining({ - level: 0, - path: [], - }), - }), - ]), - ); - - // Verify search results preserve metadata - const results = await docService.searchStore(library, version, "testing"); - expect(results).toEqual(["Mocked search result"]); - }); - - it("should handle unsupported content types gracefully", async () => { - const library = "test-lib"; - const version = "1.0.0"; - const binaryDocument = new Document({ - pageContent: "binary content with null bytes\0", - metadata: { - url: "http://example.com/image.png", - title: "Binary Image", - mimeType: "image/png", - }, - }); - - // Should not throw an error, just log a warning and return early - await expect( - docService.addDocument(library, version, binaryDocument), - ).resolves.toBeUndefined(); - - // Verify that no documents were added to the store - expect(mockStore.addDocuments).not.toHaveBeenCalled(); - }); - }); - it("should remove all documents for a specific library and version", async () => { const library = "test-lib"; const version = "1.0.0"; @@ -768,46 +661,21 @@ describe("DocumentManagementService", () => { // Tests for handling optional version parameter (null/undefined/"") describe("Optional Version Handling", () => { const library = "opt-lib"; - const doc = new Document({ - pageContent: "Optional version test", - metadata: { url: "http://opt.com" }, - }); const query = "optional"; it("exists should normalize version to empty string", async () => { await docService.exists(library, null); - expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); await docService.exists(library, undefined); - expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); await docService.exists(library, ""); - expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance - }); - - it("addDocument should normalize version to empty string", async () => { - await docService.addDocument(library, null, doc); - expect(mockStore.addDocuments).toHaveBeenCalledWith( - library, - "", - expect.any(Array), - ); // Fix: Use mockStoreInstance - await docService.addDocument(library, undefined, doc); - expect(mockStore.addDocuments).toHaveBeenCalledWith( - library, - "", - expect.any(Array), - ); // Fix: Use mockStoreInstance - await docService.addDocument(library, "", doc); - expect(mockStore.addDocuments).toHaveBeenCalledWith( - library, - "", - expect.any(Array), - ); // Fix: Use mockStoreInstance + expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); }); it("searchStore should normalize version to empty string", async () => { // Call without explicit limit, should use default limit of 5 await docService.searchStore(library, null, query); - expect(mockRetriever.search).toHaveBeenCalledWith(library, "", query, 5); // Expect default limit 5 + expect(mockRetriever.search).toHaveBeenCalledWith(library, "", query, 5); // Call with explicit limit await docService.searchStore(library, undefined, query, 7); diff --git a/src/store/DocumentManagementService.ts b/src/store/DocumentManagementService.ts index dfddd641..b2895da3 100644 --- a/src/store/DocumentManagementService.ts +++ 
b/src/store/DocumentManagementService.ts @@ -1,5 +1,4 @@ import path from "node:path"; -import type { Document } from "@langchain/core/documents"; import Fuse from "fuse.js"; import semver from "semver"; import { @@ -7,9 +6,8 @@ import { PipelineFactory, } from "../scraper/pipelines/PipelineFactory"; import type { ContentPipeline } from "../scraper/pipelines/types"; -import type { ScraperOptions } from "../scraper/types"; -import { ScrapeMode } from "../scraper/types"; -import type { ContentChunk } from "../splitter/types"; +import type { ScrapeResult, ScraperOptions } from "../scraper/types"; +import type { Chunk } from "../splitter/types"; import { analytics, extractHostname, TelemetryEvent } from "../telemetry"; import { logger } from "../utils/logger"; import { DocumentRetrieverService } from "./DocumentRetrieverService"; @@ -344,19 +342,21 @@ export class DocumentManagementService { * This is more efficient than URL-based deletion when the page ID is known. */ async removeDocumentsByPageId(pageId: number): Promise { - logger.debug(`🗑️ Removing documents for page ID: ${pageId}`); + logger.debug(`Removing documents for page ID: ${pageId}`); const count = await this.store.deleteDocumentsByPageId(pageId); - logger.debug(`🗑️ Deleted ${count} documents for page ID: ${pageId}`); + logger.info(`🗑️ Deleted ${count} documents`); return count; } /** * Retrieves all pages for a specific version ID with their metadata. - * Used for refresh operations to get existing pages with their ETags. + * Used for refresh operations to get existing pages with their ETags and depths. */ async getPagesByVersionId( versionId: number, - ): Promise> { + ): Promise< + Array<{ id: number; url: string; etag: string | null; depth: number | null }> + > { return this.store.getPagesByVersionId(versionId); } @@ -368,18 +368,16 @@ export class DocumentManagementService { */ async removeVersion(library: string, version?: string | null): Promise { const normalizedVersion = this.normalizeVersion(version); - logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`); + logger.debug(`Removing version: ${library}@${normalizedVersion || "[no version]"}`); const result = await this.store.removeVersion(library, normalizedVersion, true); - logger.info( - `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`, - ); + logger.info(`🗑️ Removed ${result.documentsDeleted} documents`); if (result.versionDeleted && result.libraryDeleted) { - logger.info(`✅ Completely removed library ${library} (was last version)`); + logger.info(`🗑️ Completely removed library ${library} (was last version)`); } else if (result.versionDeleted) { - logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`); + logger.info(`🗑️ Removed version ${library}@${normalizedVersion || "[no version]"}`); } else { logger.warn( `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`, @@ -388,108 +386,71 @@ export class DocumentManagementService { } /** - * Adds a document to the store, splitting it into smaller chunks for better search results. - * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting. - * Preserves hierarchical structure of documents and distinguishes between text and code segments. - * If version is omitted, the document is added without a specific version. + * Adds pre-processed content directly to the store. 
+ * This method is used when content has already been processed by a pipeline, + * avoiding redundant processing. Used primarily by the scraping pipeline. + * + * @param library Library name + * @param version Version string (null/undefined for unversioned) + * @param processed Pre-processed content with chunks already created + * @param pageId Optional page ID for refresh operations */ - async addDocument( + async addScrapeResult( library: string, version: string | null | undefined, - document: Document, + depth: number, + result: ScrapeResult, ): Promise { const processingStart = performance.now(); const normalizedVersion = this.normalizeVersion(version); - const url = document.metadata.url as string; - - if (!url || typeof url !== "string" || !url.trim()) { - throw new StoreError("Document metadata must include a valid URL"); + const { url, title, chunks, contentType } = result; + if (!url) { + throw new StoreError("Processed content metadata must include a valid URL"); } - logger.info(`📚 Adding document: ${document.metadata.title}`); + logger.info(`📚 Adding processed content: ${title || url}`); - if (!document.pageContent.trim()) { - throw new Error("Document content cannot be empty"); + if (chunks.length === 0) { + logger.warn(`⚠️ No chunks in processed content for ${url}. Skipping.`); + return; } - const contentType = document.metadata.mimeType as string | undefined; - try { - // Create a mock RawContent for pipeline selection - const rawContent = { - source: url, - content: document.pageContent, - mimeType: contentType || "text/plain", - }; - - // Find appropriate pipeline for content type - const pipeline = this.pipelines.find((p) => p.canProcess(rawContent)); - - if (!pipeline) { - logger.warn( - `⚠️ Unsupported content type "${rawContent.mimeType}" for document ${url}. 
Skipping processing.`, - ); - return; - } - - // Debug logging for pipeline selection - logger.debug( - `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`, - ); - - // Use content-type-specific pipeline for processing and splitting - // Create minimal scraper options for processing - const scraperOptions = { - url: url, - library: library, - version: normalizedVersion, - scrapeMode: ScrapeMode.Fetch, - ignoreErrors: false, - maxConcurrency: 1, - }; - - const processed = await pipeline.process(rawContent, scraperOptions); - const chunks = processed.chunks; - - // Convert semantic chunks to documents - const splitDocs = chunks.map((chunk: ContentChunk) => ({ - pageContent: chunk.content, - metadata: { - ...document.metadata, - level: chunk.section.level, - path: chunk.section.path, - }, - })); - logger.info(`✂️ Split document into ${splitDocs.length} chunks`); + logger.info(`✂️ Storing ${chunks.length} pre-split chunks`); // Add split documents to store - await this.store.addDocuments(library, normalizedVersion, splitDocs); + await this.store.addDocuments(library, normalizedVersion, depth, result); // Track successful document processing const processingTime = performance.now() - processingStart; + const totalContentSize = chunks.reduce( + (sum: number, chunk: Chunk) => sum + chunk.content.length, + 0, + ); + analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, { // Content characteristics (privacy-safe) - mimeType: contentType || "unknown", - contentSizeBytes: document.pageContent.length, + mimeType: contentType, + contentSizeBytes: totalContentSize, // Processing metrics processingTimeMs: Math.round(processingTime), - chunksCreated: splitDocs.length, + chunksCreated: chunks.length, // Document characteristics - hasTitle: !!document.metadata.title, - hasDescription: !!document.metadata.description, + hasTitle: !!title, + // hasDescription: !!processed.metadata.description, urlDomain: extractHostname(url), - depth: document.metadata.depth, + depth, // Library context library, libraryVersion: normalizedVersion || null, // Processing efficiency - avgChunkSizeBytes: Math.round(document.pageContent.length / splitDocs.length), + avgChunkSizeBytes: Math.round(totalContentSize / chunks.length), processingSpeedKbPerSec: Math.round( - document.pageContent.length / 1024 / (processingTime / 1000), + totalContentSize / 1024 / (processingTime / 1000), ), }); } catch (error) { @@ -498,12 +459,15 @@ export class DocumentManagementService { if (error instanceof Error) { analytics.captureException(error, { - mimeType: contentType || "unknown", - contentSizeBytes: document.pageContent.length, + mimeType: contentType, + contentSizeBytes: chunks.reduce( + (sum: number, chunk: Chunk) => sum + chunk.content.length, + 0, + ), processingTimeMs: Math.round(processingTime), library, libraryVersion: normalizedVersion || null, - context: "document_processing", + context: "processed_content_storage", component: DocumentManagementService.constructor.name, }); } diff --git a/src/store/DocumentRetrieverService.test.ts b/src/store/DocumentRetrieverService.test.ts index d01492c6..8b94e7e0 100644 --- a/src/store/DocumentRetrieverService.test.ts +++ b/src/store/DocumentRetrieverService.test.ts @@ -1,7 +1,7 @@ -import { Document } from "@langchain/core/documents"; import { beforeEach, describe, expect, it, vi } from "vitest"; import { DocumentRetrieverService } from "./DocumentRetrieverService"; import { DocumentStore } from "./DocumentStore"; +import type { DbChunkRank, DbPageChunk } from 
"./types"; vi.mock("./DocumentStore"); vi.mock("../utils/logger"); @@ -27,21 +27,26 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; // Two initial hits from the same URL, with overlapping context - const initialResult1 = new Document({ + const initialResult1 = { id: "doc1", - pageContent: "Chunk A", - metadata: { url: "url", score: 0.9 }, - }); - const initialResult2 = new Document({ + content: "Chunk A", + url: "url", + score: 0.9, + metadata: {}, + } as DbPageChunk & DbChunkRank; + const initialResult2 = { id: "doc3", - pageContent: "Chunk C", - metadata: { url: "url", score: 0.8 }, - }); - const doc2 = new Document({ + content: "Chunk C", + url: "url", + score: 0.8, + metadata: {}, + } as DbPageChunk & DbChunkRank; + const doc2 = { id: "doc2", - pageContent: "Chunk B", - metadata: { url: "url" }, - }); + content: "Chunk B", + url: "url", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([ initialResult1, @@ -85,21 +90,25 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const library = "lib"; const version = "1.0.0"; const query = "test"; - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "Main chunk", - metadata: { url: "url", score: 0.7 }, - }); - const parent = new Document({ + content: "Main chunk", + score: 0.7, + url: "url", + metadata: {}, + } as DbPageChunk & DbChunkRank; + const parent = { id: "parent1", - pageContent: "Parent", - metadata: { url: "url" }, - }); - const child = new Document({ + content: "Parent", + url: "url", + metadata: {}, + } as DbPageChunk & DbChunkRank; + const child = { id: "child1", - pageContent: "Child", - metadata: { url: "url" }, - }); + content: "Child", + url: "url", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(parent); @@ -130,16 +139,20 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const library = "lib"; const version = "1.0.0"; const query = "test"; - const docA = new Document({ + const docA = { id: "a1", - pageContent: "A1", - metadata: { url: "urlA", score: 0.8 }, - }); - const docB = new Document({ + content: "A1", + url: "urlA", + score: 0.8, + metadata: {}, + } as DbPageChunk & DbChunkRank; + const docB = { id: "b1", - pageContent: "B1", - metadata: { url: "urlB", score: 0.9 }, - }); + content: "B1", + url: "urlB", + score: 0.9, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([docA, docB]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -174,11 +187,13 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const library = "lib"; const version = "1.0.0"; const query = "test"; - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "Main chunk", - metadata: { url: "url", score: 0.5 }, - }); + content: "Main chunk", + url: "url", + score: 0.5, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -210,11 +225,13 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; const limit = 3; - const initialResult = new Document({ + const 
initialResult = { id: "doc1", - pageContent: "Main chunk", - metadata: { url: "url", score: 0.5 }, - }); + content: "Main chunk", + url: "url", + score: 0.5, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -247,11 +264,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const mimeType = "text/html"; // Create a document with mimeType in metadata - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "HTML content", - metadata: { url: "https://example.com", score: 0.9, mimeType }, - }); + content: "HTML content", + url: "https://example.com", + score: 0.9, + content_type: mimeType, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -277,11 +297,13 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Create a document without mimeType in metadata - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "Plain content", - metadata: { url: "https://example.com", score: 0.9 }, - }); + content: "Plain content", + url: "https://example.com", + score: 0.9, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -308,27 +330,27 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Child chunk with path ["Chapter 1", "Section 1.1"] - const childResult = new Document({ + const childResult = { id: "child1", - pageContent: "Child content", + content: "Child content", + url: "https://example.com", + score: 0.8, metadata: { - url: "https://example.com", - score: 0.8, path: ["Chapter 1", "Section 1.1"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; // Parent chunk with path ["Chapter 1"] - const parentChunk = new Document({ + const parentChunk = { id: "parent1", - pageContent: "Parent content", + content: "Parent content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1"], level: 1, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([childResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(parentChunk); @@ -363,38 +385,38 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Main result chunk - const mainResult = new Document({ + const mainResult = { id: "main1", - pageContent: "Main content", + content: "Main content", + url: "https://example.com", + score: 0.9, metadata: { - url: "https://example.com", - score: 0.9, path: ["Chapter 1", "Section 1.2"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; // Preceding sibling with same path level - const precedingSibling = new Document({ + const precedingSibling = { id: "preceding1", - pageContent: "Preceding content", + content: "Preceding content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; // Subsequent sibling with same path level - const subsequentSibling = new Document({ + const subsequentSibling = { id: "subsequent1", - pageContent: "Subsequent 
content", + content: "Subsequent content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.3"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([mainResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -441,37 +463,37 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Parent result chunk - const parentResult = new Document({ + const parentResult = { id: "parent1", - pageContent: "Parent section", + content: "Parent section", + url: "https://example.com", + score: 0.7, metadata: { - url: "https://example.com", - score: 0.7, path: ["Chapter 1"], level: 1, }, - }); + } as DbPageChunk & DbChunkRank; // Child chunks at deeper level - const child1 = new Document({ + const child1 = { id: "child1", - pageContent: "First subsection", + content: "First subsection", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; - const child2 = new Document({ + const child2 = { id: "child2", - pageContent: "Second subsection", + content: "Second subsection", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.2"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([parentResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -508,27 +530,27 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Multiple chunks from same document/URL, returned out of sort_order - const chunk3 = new Document({ + const chunk3 = { id: "chunk3", - pageContent: "Third chunk", + content: "Third chunk", + url: "https://example.com", + score: 0.6, metadata: { - url: "https://example.com", - score: 0.6, path: ["Section C"], level: 1, }, - }); + } as DbPageChunk & DbChunkRank; - const chunk1 = new Document({ + const chunk1 = { id: "chunk1", - pageContent: "First chunk", + content: "First chunk", + url: "https://example.com", + score: 0.8, metadata: { - url: "https://example.com", - score: 0.8, path: ["Section A"], level: 1, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([chunk3, chunk1]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -558,60 +580,60 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Main search result - a subsection - const mainResult = new Document({ + const mainResult = { id: "main1", - pageContent: "Key subsection content", + content: "Key subsection content", + url: "https://example.com", + score: 0.9, metadata: { - url: "https://example.com", - score: 0.9, path: ["Guide", "Installation", "Setup"], level: 3, }, - }); + } as DbPageChunk & DbChunkRank; // Parent at level 2 - const parent = new Document({ + const parent = { id: "parent1", - pageContent: "Installation overview", + content: "Installation overview", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Guide", "Installation"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; // Preceding sibling at same level - const precedingSibling = new Document({ + const precedingSibling = { id: "preceding1", - pageContent: "Prerequisites section", + content: "Prerequisites section", + url: "https://example.com", metadata: { - 
url: "https://example.com", path: ["Guide", "Installation", "Prerequisites"], level: 3, }, - }); + } as DbPageChunk & DbChunkRank; // Child at deeper level - const child = new Document({ + const child = { id: "child1", - pageContent: "Detailed setup steps", + content: "Detailed setup steps", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Guide", "Installation", "Setup", "Steps"], level: 4, }, - }); + } as DbPageChunk & DbChunkRank; // Subsequent sibling - const subsequentSibling = new Document({ + const subsequentSibling = { id: "subsequent1", - pageContent: "Configuration section", + content: "Configuration section", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Guide", "Installation", "Configuration"], level: 3, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([mainResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(parent); @@ -652,15 +674,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; - const markdownChunk = new Document({ + const markdownChunk = { id: "md1", - pageContent: "# Heading\n\nSome content", - metadata: { - url: "https://example.com/doc.md", - score: 0.9, - mimeType: "text/markdown", - }, - }); + content: "# Heading\n\nSome content", + url: "https://example.com/doc.md", + score: 0.9, + content_type: "text/markdown", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([markdownChunk]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -685,15 +706,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; - const codeChunk = new Document({ + const codeChunk = { id: "ts1", - pageContent: "function test() {\n return 'hello';\n}", - metadata: { - url: "https://example.com/code.ts", - score: 0.9, - mimeType: "text/x-typescript", - }, - }); + content: "function test() {\n return 'hello';\n}", + url: "https://example.com/code.ts", + score: 0.9, + content_type: "text/x-typescript", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([codeChunk]); // Mock the hierarchical strategy's fallback behavior since we don't have full hierarchy implementation @@ -717,15 +737,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; - const jsonChunk = new Document({ + const jsonChunk = { id: "json1", - pageContent: '{"key": "value"}', - metadata: { - url: "https://example.com/config.json", - score: 0.9, - mimeType: "application/json", - }, - }); + content: '{"key": "value"}', + url: "https://example.com/config.json", + score: 0.9, + content_type: "application/json", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([jsonChunk]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -748,15 +767,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; - const unknownChunk = new Document({ + const unknownChunk = { id: "unknown1", - pageContent: "Some content", - metadata: { - url: "https://example.com/unknown", - score: 0.9, - // No mimeType specified - }, - }); + content: "Some content", + url: "https://example.com/unknown", + score: 0.9, + // No mimeType specified + 
metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([unknownChunk]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); diff --git a/src/store/DocumentRetrieverService.ts b/src/store/DocumentRetrieverService.ts index f0633a34..bfd66cab 100644 --- a/src/store/DocumentRetrieverService.ts +++ b/src/store/DocumentRetrieverService.ts @@ -1,7 +1,6 @@ -import type { Document } from "@langchain/core/documents"; import { createContentAssemblyStrategy } from "./assembly/ContentAssemblyStrategyFactory"; import type { DocumentStore } from "./DocumentStore"; -import type { StoreSearchResult } from "./types"; +import type { DbChunkRank, DbPageChunk, StoreSearchResult } from "./types"; export class DocumentRetrieverService { private documentStore: DocumentStore; @@ -59,11 +58,13 @@ export class DocumentRetrieverService { /** * Groups search results by URL. */ - private groupResultsByUrl(results: Document[]): Map { - const resultsByUrl = new Map(); + private groupResultsByUrl( + results: (DbPageChunk & DbChunkRank)[], + ): Map { + const resultsByUrl = new Map(); for (const result of results) { - const url = result.metadata.url as string; + const url = result.url; if (!resultsByUrl.has(url)) { resultsByUrl.set(url, []); } @@ -83,18 +84,14 @@ export class DocumentRetrieverService { library: string, version: string, url: string, - initialChunks: Document[], + initialChunks: (DbPageChunk & DbChunkRank)[], ): Promise { - // Extract mimeType from the first document's metadata - const mimeType = - initialChunks.length > 0 - ? (initialChunks[0].metadata.mimeType as string | undefined) - : undefined; + // Extract mimeType from the first document's content_type (page-level field) + // Convert null to undefined for consistency + const mimeType = initialChunks.length > 0 ? initialChunks[0].content_type : undefined; // Find the maximum score from the initial results - const maxScore = Math.max( - ...initialChunks.map((chunk) => chunk.metadata.score as number), - ); + const maxScore = Math.max(...initialChunks.map((chunk) => chunk.score)); // Create appropriate assembly strategy based on content type const strategy = createContentAssemblyStrategy(mimeType); diff --git a/src/store/DocumentStore.test.ts b/src/store/DocumentStore.test.ts index c14ebc56..28a93355 100644 --- a/src/store/DocumentStore.test.ts +++ b/src/store/DocumentStore.test.ts @@ -1,5 +1,6 @@ -import type { Document } from "@langchain/core/documents"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { ScrapeResult } from "../scraper/types"; +import type { Chunk } from "../splitter/types"; import { DocumentStore } from "./DocumentStore"; import { EmbeddingConfig } from "./embeddings/EmbeddingConfig"; import { VersionStatus } from "./types"; @@ -62,6 +63,41 @@ vi.mock("./embeddings/EmbeddingFactory", async (importOriginal) => { }; }); +/** + * Helper function to create minimal ScrapeResult for testing. + * Converts simplified test data to the ScrapeResult format expected by addDocuments. 
+ */ +function createScrapeResult( + title: string, + url: string, + content: string, + path: string[] = [], + options?: { + etag?: string | null; + lastModified?: string | null; + }, +): ScrapeResult { + const chunks: Chunk[] = [ + { + types: ["text"], + content, + section: { level: 0, path }, + }, + ]; + + return { + url, + title, + contentType: "text/html", + textContent: content, + links: [], + errors: [], + chunks, + etag: options?.etag, + lastModified: options?.lastModified, + } satisfies ScrapeResult; +} + /** * Tests for DocumentStore with embeddings enabled * Uses explicit embedding configuration and tests hybrid search functionality @@ -88,26 +124,29 @@ describe("DocumentStore - With Embeddings", () => { describe("Document Storage and Retrieval", () => { it("should store and retrieve documents with proper metadata", async () => { - const docs: Document[] = [ - { - pageContent: "JavaScript programming tutorial with examples", - metadata: { - title: "JS Tutorial", - url: "https://example.com/js-tutorial", - path: ["programming", "javascript"], - }, - }, - { - pageContent: "Python data science guide with pandas", - metadata: { - title: "Python DS", - url: "https://example.com/python-ds", - path: ["programming", "python"], - }, - }, - ]; - - await store.addDocuments("testlib", "1.0.0", docs); + // Add two pages separately + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + "JS Tutorial", + "https://example.com/js-tutorial", + "JavaScript programming tutorial with examples", + ["programming", "javascript"], + ), + ); + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + "Python DS", + "https://example.com/python-ds", + "Python data science guide with pandas", + ["programming", "python"], + ), + ); // Verify documents were stored expect(await store.checkDocumentExists("testlib", "1.0.0")).toBe(true); @@ -136,18 +175,17 @@ describe("DocumentStore - With Embeddings", () => { }); it("should handle document deletion correctly", async () => { - const docs: Document[] = [ - { - pageContent: "Temporary document for deletion test", - metadata: { - title: "Temp Doc", - url: "https://example.com/temp", - path: ["temp"], - }, - }, - ]; - - await store.addDocuments("templib", "1.0.0", docs); + await store.addDocuments( + "templib", + "1.0.0", + 1, + createScrapeResult( + "Temp Doc", + "https://example.com/temp", + "Temporary document for deletion test", + ["temp"], + ), + ); expect(await store.checkDocumentExists("templib", "1.0.0")).toBe(true); const deletedCount = await store.deleteDocuments("templib", "1.0.0"); @@ -156,27 +194,29 @@ describe("DocumentStore - With Embeddings", () => { }); it("should completely remove a version including pages and documents", async () => { - const docs: Document[] = [ - { - pageContent: "First document for removal test", - metadata: { - title: "Doc 1", - url: "https://example.com/doc1", - path: ["docs"], - }, - }, - { - pageContent: "Second document for removal test", - metadata: { - title: "Doc 2", - url: "https://example.com/doc2", - path: ["docs"], - }, - }, - ]; - - // Add documents and verify they exist - await store.addDocuments("removelib", "1.0.0", docs); + // Add two pages + await store.addDocuments( + "removelib", + "1.0.0", + 1, + createScrapeResult( + "Doc 1", + "https://example.com/doc1", + "First document for removal test", + ["docs"], + ), + ); + await store.addDocuments( + "removelib", + "1.0.0", + 1, + createScrapeResult( + "Doc 2", + "https://example.com/doc2", + "Second document for 
removal test", + ["docs"], + ), + ); expect(await store.checkDocumentExists("removelib", "1.0.0")).toBe(true); // Remove the version @@ -192,31 +232,23 @@ describe("DocumentStore - With Embeddings", () => { }); it("should remove version but keep library when other versions exist", async () => { - const v1Docs: Document[] = [ - { - pageContent: "Version 1 document", - metadata: { - title: "V1 Doc", - url: "https://example.com/v1", - path: ["v1"], - }, - }, - ]; - - const v2Docs: Document[] = [ - { - pageContent: "Version 2 document", - metadata: { - title: "V2 Doc", - url: "https://example.com/v2", - path: ["v2"], - }, - }, - ]; - // Add two versions - await store.addDocuments("multilib", "1.0.0", v1Docs); - await store.addDocuments("multilib", "2.0.0", v2Docs); + await store.addDocuments( + "multilib", + "1.0.0", + 1, + createScrapeResult("V1 Doc", "https://example.com/v1", "Version 1 document", [ + "v1", + ]), + ); + await store.addDocuments( + "multilib", + "2.0.0", + 1, + createScrapeResult("V2 Doc", "https://example.com/v2", "Version 2 document", [ + "v2", + ]), + ); // Remove only version 1.0.0 const result = await store.removeVersion("multilib", "1.0.0", true); @@ -232,30 +264,28 @@ describe("DocumentStore - With Embeddings", () => { }); it("should handle multiple versions of the same library", async () => { - const v1Docs: Document[] = [ - { - pageContent: "Version 1.0 feature documentation", - metadata: { - title: "V1 Features", - url: "https://example.com/v1", - path: ["features"], - }, - }, - ]; - - const v2Docs: Document[] = [ - { - pageContent: "Version 2.0 feature documentation with new capabilities", - metadata: { - title: "V2 Features", - url: "https://example.com/v2", - path: ["features"], - }, - }, - ]; - - await store.addDocuments("versionlib", "1.0.0", v1Docs); - await store.addDocuments("versionlib", "2.0.0", v2Docs); + await store.addDocuments( + "versionlib", + "1.0.0", + 1, + createScrapeResult( + "V1 Features", + "https://example.com/v1", + "Version 1.0 feature documentation", + ["features"], + ), + ); + await store.addDocuments( + "versionlib", + "2.0.0", + 1, + createScrapeResult( + "V2 Features", + "https://example.com/v2", + "Version 2.0 feature documentation with new capabilities", + ["features"], + ), + ); expect(await store.checkDocumentExists("versionlib", "1.0.0")).toBe(true); expect(await store.checkDocumentExists("versionlib", "2.0.0")).toBe(true); @@ -269,20 +299,18 @@ describe("DocumentStore - With Embeddings", () => { const testEtag = '"abc123-def456"'; const testLastModified = "2023-12-01T10:30:00Z"; - const docs: Document[] = [ - { - pageContent: "Test document with etag and lastModified", - metadata: { - title: "ETag Test Doc", - url: "https://example.com/etag-test", - path: ["test"], - etag: testEtag, - lastModified: testLastModified, - }, - }, - ]; - - await store.addDocuments("etagtest", "1.0.0", docs); + await store.addDocuments( + "etagtest", + "1.0.0", + 1, + createScrapeResult( + "ETag Test Doc", + "https://example.com/etag-test", + "Test document with etag and lastModified", + ["test"], + { etag: testEtag, lastModified: testLastModified }, + ), + ); // Query the database directly to verify the etag and last_modified are stored // @ts-expect-error Accessing private property for testing @@ -311,43 +339,46 @@ describe("DocumentStore - With Embeddings", () => { expect(results.length).toBeGreaterThan(0); const doc = results[0]; - expect(doc.metadata.url).toBe("https://example.com/etag-test"); + 
expect(doc.url).toBe("https://example.com/etag-test"); }); }); describe("Hybrid Search with Embeddings", () => { beforeEach(async () => { // Set up test documents with known semantic relationships for ranking tests - const docs: Document[] = [ - { - pageContent: "JavaScript programming tutorial with code examples and functions", - metadata: { - title: "JavaScript Programming Guide", - url: "https://example.com/js-guide", - path: ["programming", "javascript"], - }, - }, - { - pageContent: - "Advanced JavaScript frameworks like React and Vue for building applications", - metadata: { - title: "JavaScript Frameworks", - url: "https://example.com/js-frameworks", - path: ["programming", "javascript", "frameworks"], - }, - }, - { - pageContent: - "Python programming language tutorial for data science and machine learning", - metadata: { - title: "Python Programming", - url: "https://example.com/python-guide", - path: ["programming", "python"], - }, - }, - ]; - - await store.addDocuments("searchtest", "1.0.0", docs); + await store.addDocuments( + "searchtest", + "1.0.0", + 1, + createScrapeResult( + "JavaScript Programming Guide", + "https://example.com/js-guide", + "JavaScript programming tutorial with code examples and functions", + ["programming", "javascript"], + ), + ); + await store.addDocuments( + "searchtest", + "1.0.0", + 1, + createScrapeResult( + "JavaScript Frameworks", + "https://example.com/js-frameworks", + "Advanced JavaScript frameworks like React and Vue for building applications", + ["programming", "javascript", "frameworks"], + ), + ); + await store.addDocuments( + "searchtest", + "1.0.0", + 1, + createScrapeResult( + "Python Programming", + "https://example.com/python-guide", + "Python programming language tutorial for data science and machine learning", + ["programming", "python"], + ), + ); }); it("should perform hybrid search combining vector and FTS", async () => { @@ -362,31 +393,28 @@ describe("DocumentStore - With Embeddings", () => { // JavaScript documents should rank higher than non-JavaScript documents const topResult = results[0]; - expect(topResult.pageContent.toLowerCase()).toContain("javascript"); + expect(topResult.content.toLowerCase()).toContain("javascript"); // Results should have both vector and FTS ranking metadata const hybridResults = results.filter( - (r) => r.metadata.vec_rank !== undefined && r.metadata.fts_rank !== undefined, + (r) => r.vec_rank !== undefined && r.fts_rank !== undefined, ); // At least some results should be hybrid matches if (hybridResults.length > 0) { for (const result of hybridResults) { - expect(result.metadata.vec_rank).toBeGreaterThan(0); - expect(result.metadata.fts_rank).toBeGreaterThan(0); - expect(result.metadata.score).toBeGreaterThan(0); + expect(result.vec_rank).toBeGreaterThan(0); + expect(result.fts_rank).toBeGreaterThan(0); + expect(result.score).toBeGreaterThan(0); } } // All results should have valid scores for (const result of results) { - expect(result.metadata.score).toBeGreaterThan(0); - expect(typeof result.metadata.score).toBe("number"); + expect(result.score).toBeGreaterThan(0); + expect(typeof result.score).toBe("number"); // Results should have either vec_rank, fts_rank, or both - expect( - result.metadata.vec_rank !== undefined || - result.metadata.fts_rank !== undefined, - ).toBe(true); + expect(result.vec_rank !== undefined || result.fts_rank !== undefined).toBe(true); } }); @@ -402,22 +430,22 @@ describe("DocumentStore - With Embeddings", () => { // Should find programming documents const 
programmingResults = results.filter((r) => - r.pageContent.toLowerCase().includes("programming"), + r.content.toLowerCase().includes("programming"), ); expect(programmingResults.length).toBeGreaterThan(0); // At least some results should have vector ranks (semantic/embedding matching) // If no vector results, it might be because embeddings were disabled in this test run - const vectorResults = results.filter((r) => r.metadata.vec_rank !== undefined); - const ftsResults = results.filter((r) => r.metadata.fts_rank !== undefined); + const vectorResults = results.filter((r) => r.vec_rank !== undefined); + const ftsResults = results.filter((r) => r.fts_rank !== undefined); // Either we have vector results (hybrid search) or FTS results (fallback) expect(vectorResults.length > 0 || ftsResults.length > 0).toBe(true); // All results should have valid scores for (const result of results) { - expect(result.metadata.score).toBeGreaterThan(0); + expect(result.score).toBeGreaterThan(0); } }); }); @@ -444,16 +472,19 @@ describe("DocumentStore - With Embeddings", () => { // Create 3 docs that fit 2 per batch by character size const contentSize = 24000; // 24KB each - const docs: Document[] = Array.from({ length: 3 }, (_, i) => ({ - pageContent: "x".repeat(contentSize), - metadata: { - title: `Doc ${i + 1}`, - url: `https://example.com/doc${i + 1}`, - path: ["section"], - }, - })); - - await store.addDocuments("testlib", "1.0.0", docs); + for (let i = 0; i < 3; i++) { + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + `Doc ${i + 1}`, + `https://example.com/doc${i + 1}`, + "x".repeat(contentSize), + ["section"], + ), + ); + } // Should create 2 batches - first with 2 docs, second with 1 doc expect(mockEmbedDocuments).toHaveBeenCalledTimes(2); @@ -468,18 +499,16 @@ describe("DocumentStore - With Embeddings", () => { return; } - const docs: Document[] = [ - { - pageContent: "Test content", - metadata: { - title: "Test Title", - url: "https://example.com/test", - path: ["path", "to", "doc"], - }, - }, - ]; - - await store.addDocuments("testlib", "1.0.0", docs); + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult("Test Title", "https://example.com/test", "Test content", [ + "path", + "to", + "doc", + ]), + ); // Embedding text should include structured metadata expect(mockEmbedDocuments).toHaveBeenCalledTimes(1); @@ -494,18 +523,17 @@ describe("DocumentStore - With Embeddings", () => { describe("Status Tracking and Metadata", () => { it("should update version status correctly", async () => { - const docs: Document[] = [ - { - pageContent: "Status tracking test content", - metadata: { - title: "Status Test", - url: "https://example.com/status-test", - path: ["test"], - }, - }, - ]; - - await store.addDocuments("statuslib", "1.0.0", docs); + await store.addDocuments( + "statuslib", + "1.0.0", + 1, + createScrapeResult( + "Status Test", + "https://example.com/status-test", + "Status tracking test content", + ["test"], + ), + ); const versionId = await store.resolveVersionId("statuslib", "1.0.0"); await store.updateVersionStatus(versionId, VersionStatus.QUEUED); @@ -579,19 +607,18 @@ describe("DocumentStore - Without Embeddings (FTS-only)", () => { store = new DocumentStore(":memory:"); await store.initialize(); - const testDocuments: Document[] = [ - { - pageContent: "This is a test document about React hooks.", - metadata: { - url: "https://example.com/react-hooks", - title: "React Hooks Guide", - path: ["React", "Hooks"], - }, - }, - ]; - await expect( - 
store.addDocuments("react", "18.0.0", testDocuments), + store.addDocuments( + "react", + "18.0.0", + 1, + createScrapeResult( + "React Hooks Guide", + "https://example.com/react-hooks", + "This is a test document about React hooks.", + ["React", "Hooks"], + ), + ), ).resolves.not.toThrow(); const exists = await store.checkDocumentExists("react", "18.0.0"); @@ -604,43 +631,45 @@ describe("DocumentStore - Without Embeddings (FTS-only)", () => { store = new DocumentStore(":memory:"); await store.initialize(); - const testDocuments: Document[] = [ - { - pageContent: "React hooks are a powerful feature for state management.", - metadata: { - url: "https://example.com/react-hooks", - title: "React Hooks Guide", - path: ["React", "Hooks"], - }, - }, - { - pageContent: "TypeScript provides excellent type safety for JavaScript.", - metadata: { - url: "https://example.com/typescript-intro", - title: "TypeScript Introduction", - path: ["TypeScript", "Intro"], - }, - }, - ]; - - await store.addDocuments("testlib", "1.0.0", testDocuments); + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + "React Hooks Guide", + "https://example.com/react-hooks", + "React hooks are a powerful feature for state management.", + ["React", "Hooks"], + ), + ); + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + "TypeScript Introduction", + "https://example.com/typescript-intro", + "TypeScript provides excellent type safety for JavaScript.", + ["TypeScript", "Intro"], + ), + ); }); it("should perform FTS-only search", async () => { const results = await store.findByContent("testlib", "1.0.0", "React hooks", 5); expect(results.length).toBeGreaterThan(0); - expect(results[0].pageContent).toContain("React hooks"); - expect(results[0].metadata).toHaveProperty("score"); - expect(results[0].metadata).toHaveProperty("fts_rank"); + expect(results[0].content).toContain("React hooks"); + expect(results[0]).toHaveProperty("score"); + expect(results[0]).toHaveProperty("fts_rank"); // Should NOT have vector rank since vectorization is disabled - expect(results[0].metadata.vec_rank).toBeUndefined(); + expect((results[0] as any).vec_rank).toBeUndefined(); }); it("should handle various search queries correctly", async () => { const jsResults = await store.findByContent("testlib", "1.0.0", "TypeScript", 5); expect(jsResults.length).toBeGreaterThan(0); - expect(jsResults[0].pageContent).toContain("TypeScript"); + expect(jsResults[0].content).toContain("TypeScript"); // Empty query should return empty results const emptyResults = await store.findByContent("testlib", "1.0.0", "", 5); @@ -706,64 +735,61 @@ describe("DocumentStore - Common Functionality", () => { describe("Version Isolation", () => { it("should search within specific versions only", async () => { - const docsV1: Document[] = [ - { - pageContent: "Old feature documentation", - metadata: { - title: "Old Feature", - url: "https://example.com/old", - path: ["features"], - }, - }, - ]; - - const docsV2: Document[] = [ - { - pageContent: "New feature documentation", - metadata: { - title: "New Feature", - url: "https://example.com/new", - path: ["features"], - }, - }, - ]; - - await store.addDocuments("featuretest", "1.0.0", docsV1); - await store.addDocuments("featuretest", "2.0.0", docsV2); + await store.addDocuments( + "featuretest", + "1.0.0", + 1, + createScrapeResult( + "Old Feature", + "https://example.com/old", + "Old feature documentation", + ["features"], + ), + ); + await store.addDocuments( + "featuretest", + 
"2.0.0", + 1, + createScrapeResult( + "New Feature", + "https://example.com/new", + "New feature documentation", + ["features"], + ), + ); const v1Results = await store.findByContent("featuretest", "1.0.0", "feature", 10); expect(v1Results.length).toBeGreaterThan(0); - expect(v1Results[0].metadata.title).toBe("Old Feature"); + expect(v1Results[0].title).toBe("Old Feature"); const v2Results = await store.findByContent("featuretest", "2.0.0", "feature", 10); expect(v2Results.length).toBeGreaterThan(0); - expect(v2Results[0].metadata.title).toBe("New Feature"); + expect(v2Results[0].title).toBe("New Feature"); }); }); describe("Document Management", () => { it("should retrieve documents by ID", async () => { - const docs: Document[] = [ - { - pageContent: "Test document for ID retrieval", - metadata: { - title: "ID Test Doc", - url: "https://example.com/id-test", - path: ["test"], - }, - }, - ]; - - await store.addDocuments("idtest", "1.0.0", docs); + await store.addDocuments( + "idtest", + "1.0.0", + 1, + createScrapeResult( + "ID Test Doc", + "https://example.com/id-test", + "Test document for ID retrieval", + ["test"], + ), + ); const results = await store.findByContent("idtest", "1.0.0", "test document", 10); expect(results.length).toBeGreaterThan(0); const doc = results[0]; - expect(doc.metadata.id).toBeDefined(); + expect(doc.id).toBeDefined(); - const retrievedDoc = await store.getById(doc.metadata.id); + const retrievedDoc = await store.getById(doc.id); expect(retrievedDoc).not.toBeNull(); - expect(retrievedDoc?.metadata.title).toBe("ID Test Doc"); + expect(retrievedDoc?.title).toBe("ID Test Doc"); }); it("should handle URL pre-deletion correctly", async () => { @@ -794,39 +820,50 @@ describe("DocumentStore - Common Functionality", () => { return result.count; } - // Add initial documents - const initialDocs: Document[] = [ - { - pageContent: "Initial content chunk 1", - metadata: { url, title: "Initial Test Page", path: ["section1"] }, - }, - { - pageContent: "Initial content chunk 2", - metadata: { url, title: "Initial Test Page", path: ["section2"] }, - }, - ]; - - await store.addDocuments(library, version, initialDocs); + // Add initial page with 2 chunks + await store.addDocuments(library, version, 1, { + ...createScrapeResult("Initial Test Page", url, "Initial content chunk 1", [ + "section1", + ]), + chunks: [ + { + types: ["text"], + content: "Initial content chunk 1", + section: { level: 0, path: ["section1"] }, + }, + { + types: ["text"], + content: "Initial content chunk 2", + section: { level: 0, path: ["section2"] }, + }, + ], + }); expect(await countDocuments()).toBe(2); expect(await countDocuments(url)).toBe(2); - // Update with new documents (should trigger pre-deletion) - const updatedDocs: Document[] = [ - { - pageContent: "Updated content chunk 1", - metadata: { url, title: "Updated Test Page", path: ["updated-section1"] }, - }, - { - pageContent: "Updated content chunk 2", - metadata: { url, title: "Updated Test Page", path: ["updated-section2"] }, - }, - { - pageContent: "Updated content chunk 3", - metadata: { url, title: "Updated Test Page", path: ["updated-section3"] }, - }, - ]; - - await store.addDocuments(library, version, updatedDocs); + // Update with new page (should trigger pre-deletion) + await store.addDocuments(library, version, 1, { + ...createScrapeResult("Updated Test Page", url, "Updated content chunk 1", [ + "updated-section1", + ]), + chunks: [ + { + types: ["text"], + content: "Updated content chunk 1", + section: { level: 0, path: 
["updated-section1"] }, + }, + { + types: ["text"], + content: "Updated content chunk 2", + section: { level: 0, path: ["updated-section2"] }, + }, + { + types: ["text"], + content: "Updated content chunk 3", + section: { level: 0, path: ["updated-section3"] }, + }, + ], + }); expect(await countDocuments()).toBe(3); expect(await countDocuments(url)).toBe(3); }); @@ -834,18 +871,17 @@ describe("DocumentStore - Common Functionality", () => { describe("Search Security", () => { beforeEach(async () => { - const docs: Document[] = [ - { - pageContent: "Programming computers is fun and educational for developers", - metadata: { - title: "Programming Guide", - url: "https://example.com/programming", - path: ["programming", "guide"], - }, - }, - ]; - - await store.addDocuments("security-test", "1.0.0", docs); + await store.addDocuments( + "security-test", + "1.0.0", + 1, + createScrapeResult( + "Programming Guide", + "https://example.com/programming", + "Programming computers is fun and educational for developers", + ["programming", "guide"], + ), + ); }); it("should safely handle malicious queries", async () => { diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index 67c7bdc3..d23b1767 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -1,10 +1,8 @@ -import type { Document } from "@langchain/core/documents"; import type { Embeddings } from "@langchain/core/embeddings"; import Database, { type Database as DatabaseType } from "better-sqlite3"; import semver from "semver"; import * as sqliteVec from "sqlite-vec"; -import type { ScraperOptions } from "../scraper/types"; -import type { DocumentMetadata } from "../types"; +import type { ScrapeResult, ScraperOptions } from "../scraper/types"; import { EMBEDDING_BATCH_CHARS, EMBEDDING_BATCH_SIZE, @@ -23,22 +21,23 @@ import { UnsupportedProviderError, } from "./embeddings/EmbeddingFactory"; import { ConnectionError, DimensionError, StoreError } from "./errors"; -import type { StoredScraperOptions } from "./types"; +import type { DbChunkMetadata, DbChunkRank, StoredScraperOptions } from "./types"; import { - type DbDocument, - type DbJoinedDocument, + type DbChunk, + type DbLibraryVersion, + type DbPage, + type DbPageChunk, type DbQueryResult, type DbVersion, type DbVersionWithLibrary, denormalizeVersionName, - mapDbDocumentToDocument, normalizeVersionName, VECTOR_DIMENSION, type VersionScraperOptions, type VersionStatus, } from "./types"; -interface RawSearchResult extends DbDocument { +interface RawSearchResult extends DbChunk { // Page fields joined from pages table url?: string; title?: string; @@ -75,7 +74,7 @@ export class DocumentStore { insertEmbedding: Database.Statement<[bigint, string]>; // New statement for pages table insertPage: Database.Statement< - [number, string, string, string | null, string | null, string | null] + [number, string, string, string | null, string | null, string | null, number | null] >; getPageId: Database.Statement<[number, string]>; deleteDocuments: Database.Statement<[string, string]>; @@ -186,7 +185,7 @@ export class DocumentStore { private prepareStatements(): void { const statements = { getById: this.db.prepare<[bigint]>( - `SELECT d.*, p.url, p.title, p.content_type + `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id WHERE d.id = ?`, @@ -199,9 +198,17 @@ export class DocumentStore { "UPDATE documents SET embedding = ? 
WHERE id = ?", ), insertPage: this.db.prepare< - [number, string, string, string | null, string | null, string | null] + [ + number, + string, + string, + string | null, + string | null, + string | null, + number | null, + ] >( - "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type) VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type", + "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type, etag = excluded.etag, last_modified = excluded.last_modified, depth = excluded.depth", ), getPageId: this.db.prepare<[number, string]>( "SELECT id FROM pages WHERE version_id = ? AND url = ?", @@ -301,7 +308,7 @@ export class DocumentStore { getChildChunks: this.db.prepare< [string, string, string, number, string, bigint, number] >(` - SELECT d.*, p.url, p.title, p.content_type FROM documents d + SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -317,7 +324,7 @@ export class DocumentStore { getPrecedingSiblings: this.db.prepare< [string, string, string, bigint, string, number] >(` - SELECT d.*, p.url, p.title, p.content_type FROM documents d + SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -332,7 +339,7 @@ export class DocumentStore { getSubsequentSiblings: this.db.prepare< [string, string, string, bigint, string, number] >(` - SELECT d.*, p.url, p.title, p.content_type FROM documents d + SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -345,7 +352,7 @@ export class DocumentStore { LIMIT ? `), getParentChunk: this.db.prepare<[string, string, string, string, bigint]>(` - SELECT d.*, p.url, p.title, p.content_type FROM documents d + SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -389,7 +396,7 @@ export class DocumentStore { WHERE l.name = ? 
AND COALESCE(v.name, '') = COALESCE(?, '')`, ), getPagesByVersionId: this.db.prepare<[number]>( - "SELECT id, url, etag FROM pages WHERE version_id = ?", + "SELECT * FROM pages WHERE version_id = ?", ), }; this.statements = statements; @@ -773,21 +780,7 @@ export class DocumentStore { > > { try { - // Define the expected row structure from the GROUP BY query (including versions without documents) - interface LibraryVersionRow { - library: string; - version: string; - versionId: number; - status: VersionStatus; - progressPages: number; - progressMaxPages: number; - sourceUrl: string | null; - documentCount: number; - uniqueUrlCount: number; - indexedAt: string | null; // MIN() may return null - } - - const rows = this.statements.queryLibraryVersions.all() as LibraryVersionRow[]; + const rows = this.statements.queryLibraryVersions.all() as DbLibraryVersion[]; const libraryMap = new Map< string, Array<{ @@ -863,34 +856,22 @@ export class DocumentStore { async addDocuments( library: string, version: string, - documents: Document[], + depth: number, + result: ScrapeResult, ): Promise { try { - if (documents.length === 0) { + const { title, url, chunks } = result; + if (chunks.length === 0) { return; } - // Group documents by URL to create pages - const documentsByUrl = new Map(); - for (const doc of documents) { - const url = doc.metadata.url as string; - if (!url || typeof url !== "string" || !url.trim()) { - throw new StoreError("Document metadata must include a valid URL"); - } - - if (!documentsByUrl.has(url)) { - documentsByUrl.set(url, []); - } - documentsByUrl.get(url)?.push(doc); - } - // Generate embeddings in batch only if vector search is enabled let paddedEmbeddings: number[][] = []; if (this.isVectorSearchEnabled) { - const texts = documents.map((doc) => { - const header = `${doc.metadata.title}\n${doc.metadata.url}\n${(doc.metadata.path || []).join(" / ")}\n`; - return `${header}${doc.pageContent}`; + const texts = chunks.map((chunk) => { + const header = `${title}\n${url}\n${(chunk.section.path || []).join(" / ")}\n`; + return `${header}${chunk.content}`; }); // Batch embedding creation to avoid token limit errors @@ -949,95 +930,73 @@ export class DocumentStore { const versionId = await this.resolveVersionId(library, version); // Delete existing documents for these URLs to prevent conflicts - for (const url of documentsByUrl.keys()) { - const deletedCount = await this.deleteDocumentsByUrl(library, version, url); - if (deletedCount > 0) { - logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`); - } + const deletedCount = await this.deleteDocumentsByUrl(library, version, url); + if (deletedCount > 0) { + logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`); } // Insert documents in a transaction - const transaction = this.db.transaction((docsByUrl: Map) => { - // First, create or update pages for each unique URL - const pageIds = new Map(); - - for (const [url, urlDocs] of docsByUrl) { - // Use the first document's metadata for page-level data - const firstDoc = urlDocs[0]; - const title = firstDoc.metadata.title || ""; - // Extract content type from metadata if available - const contentType = firstDoc.metadata.contentType || null; - - // Extract etag from document metadata if available - const etag = firstDoc.metadata.etag || null; - - // Extract lastModified from document metadata if available - const lastModified = firstDoc.metadata.lastModified || null; - - // Insert or update page record - this.statements.insertPage.run( - versionId, 
- url, - title, - etag, - lastModified, - contentType, - ); - - // Query for the page ID since we can't use RETURNING - const existingPage = this.statements.getPageId.get(versionId, url) as - | { id: number } - | undefined; - if (!existingPage) { - throw new StoreError(`Failed to get page ID for URL: ${url}`); - } - const pageId = existingPage.id; - pageIds.set(url, pageId); + const transaction = this.db.transaction(() => { + // Extract content type from metadata if available + const contentType = result.contentType || null; + + // Extract etag from document metadata if available + const etag = result.etag || null; + + // Extract lastModified from document metadata if available + const lastModified = result.lastModified || null; + + // Insert or update page record + this.statements.insertPage.run( + versionId, + url, + title || "", + etag, + lastModified, + contentType, + depth, + ); + + // Query for the page ID since we can't use RETURNING + const existingPage = this.statements.getPageId.get(versionId, url) as + | { id: number } + | undefined; + if (!existingPage) { + throw new StoreError(`Failed to get page ID for URL: ${url}`); } + const pageId = existingPage.id; // Then insert document chunks linked to their pages let docIndex = 0; - for (const [url, urlDocs] of docsByUrl) { - const pageId = pageIds.get(url); - if (!pageId) { - throw new StoreError(`Failed to get page ID for URL: ${url}`); - } + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + + // Insert document chunk + const result = this.statements.insertDocument.run( + pageId, + chunk.content, + JSON.stringify({ + types: chunk.types, + level: chunk.section.level, + path: chunk.section.path, + } satisfies DbChunkMetadata), + i, // sort_order within this page + ); + const rowId = result.lastInsertRowid; - for (let i = 0; i < urlDocs.length; i++) { - const doc = urlDocs[i]; - - // Create chunk-specific metadata (remove page-level fields) - const { - url: _, - title: __, - library: ___, - version: ____, - ...chunkMetadata - } = doc.metadata; - - // Insert document chunk - const result = this.statements.insertDocument.run( - pageId, - doc.pageContent, - JSON.stringify(chunkMetadata), - i, // sort_order within this page + // Insert into vector table only if vector search is enabled + if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) { + this.statements.insertEmbedding.run( + BigInt(rowId), + JSON.stringify(paddedEmbeddings[docIndex]), ); - const rowId = result.lastInsertRowid; - - // Insert into vector table only if vector search is enabled - if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) { - this.statements.insertEmbedding.run( - BigInt(rowId), - JSON.stringify(paddedEmbeddings[docIndex]), - ); - } - - docIndex++; } + + docIndex++; } }); - transaction(documentsByUrl); + transaction(); } catch (error) { throw new ConnectionError("Failed to add documents to store", error); } @@ -1098,18 +1057,12 @@ export class DocumentStore { /** * Retrieves all pages for a specific version ID with their metadata. - * Used for refresh operations to get existing pages with their ETags. + * Used for refresh operations to get existing pages with their ETags and depths. 
* @returns Array of page records */ - async getPagesByVersionId( - versionId: number, - ): Promise> { + async getPagesByVersionId(versionId: number): Promise { try { - const result = this.statements.getPagesByVersionId.all(versionId) as Array<{ - id: number; - url: string; - etag: string | null; - }>; + const result = this.statements.getPagesByVersionId.all(versionId) as DbPage[]; return result; } catch (error) { throw new ConnectionError("Failed to get pages by version ID", error); @@ -1189,21 +1142,42 @@ export class DocumentStore { } } + /** + * Parses the metadata field from a JSON string to an object. + * This is necessary because better-sqlite3's json() function returns a string, not an object. + */ + private parseMetadata(row: T): T { + if (row.metadata && typeof row.metadata === "string") { + try { + row.metadata = JSON.parse(row.metadata); + } catch (error) { + logger.warn(`Failed to parse metadata JSON: ${error}`); + row.metadata = {} as M; + } + } + return row; + } + + /** + * Parses metadata for an array of rows. + */ + private parseMetadataArray(rows: T[]): T[] { + return rows.map((row) => this.parseMetadata(row)); + } + /** * Retrieves a document by its ID. * @param id The ID of the document. * @returns The document, or null if not found. */ - async getById(id: string): Promise { + async getById(id: string): Promise { try { - const row = this.statements.getById.get( - BigInt(id), - ) as DbQueryResult; + const row = this.statements.getById.get(BigInt(id)) as DbQueryResult; if (!row) { return null; } - return mapDbDocumentToDocument(row); + return this.parseMetadata(row); } catch (error) { throw new ConnectionError(`Failed to get document by ID ${id}`, error); } @@ -1219,7 +1193,7 @@ export class DocumentStore { version: string, query: string, limit: number, - ): Promise { + ): Promise<(DbPageChunk & DbChunkRank)[]> { try { // Return empty array for empty or whitespace-only queries if (!query || typeof query !== "string" || query.trim().length === 0) { @@ -1310,25 +1284,20 @@ export class DocumentStore { .sort((a, b) => b.rrf_score - a.rrf_score) .slice(0, limit); - return topResults.map((row) => ({ - ...mapDbDocumentToDocument({ + return topResults.map((row) => { + const result: DbPageChunk = { ...row, url: row.url || "", // Ensure url is never undefined - title: row.title, - content_type: row.content_type, - } as DbJoinedDocument), - metadata: { - ...JSON.parse(row.metadata), - id: row.id, + title: row.title || null, + content_type: row.content_type || null, + }; + // Add search scores as additional properties (not in metadata) + return Object.assign(result, { score: row.rrf_score, vec_rank: row.vec_rank, fts_rank: row.fts_rank, - // Explicitly add page fields if they exist - url: row.url || "", - title: row.title || "", - ...(row.content_type && { contentType: row.content_type }), - }, - })); + }); + }); } else { // Fallback: full-text search only const stmt = this.db.prepare(` @@ -1364,25 +1333,19 @@ export class DocumentStore { ) as (RawSearchResult & { fts_score: number })[]; // Assign FTS ranks based on order (best score = rank 1) - return rawResults.map((row, index) => ({ - ...mapDbDocumentToDocument({ + return rawResults.map((row, index) => { + const result: DbPageChunk = { ...row, url: row.url || "", // Ensure url is never undefined - title: row.title, - content_type: row.content_type, - } as DbJoinedDocument), - metadata: { - ...JSON.parse(row.metadata), - id: row.id, + title: row.title || null, + content_type: row.content_type || null, + }; + // Add search scores as 
additional properties (not in metadata) + return Object.assign(result, { score: -row.fts_score, // Convert BM25 score to positive value for consistency fts_rank: index + 1, // Assign rank based on order (1-based) - // Explicitly ensure vec_rank is not included in FTS-only mode - // Explicitly add page fields - url: row.url || "", - title: row.title || "", - ...(row.content_type && { contentType: row.content_type }), - }, - })); + }); + }); } } catch (error) { throw new ConnectionError( @@ -1400,28 +1363,27 @@ export class DocumentStore { version: string, id: string, limit: number, - ): Promise { + ): Promise { try { const parent = await this.getById(id); if (!parent) { return []; } - const parentPath = (parent.metadata as DocumentMetadata).path ?? []; - const parentUrl = (parent.metadata as DocumentMetadata).url; + const parentPath = parent.metadata.path ?? []; const normalizedVersion = version.toLowerCase(); const result = this.statements.getChildChunks.all( library.toLowerCase(), normalizedVersion, - parentUrl, + parent.url, parentPath.length + 1, JSON.stringify(parentPath), BigInt(id), limit, - ) as Array; + ) as Array; - return result.map((row) => mapDbDocumentToDocument(row)); + return this.parseMetadataArray(result); } catch (error) { throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error); } @@ -1435,26 +1397,25 @@ export class DocumentStore { version: string, id: string, limit: number, - ): Promise { + ): Promise { try { const reference = await this.getById(id); if (!reference) { return []; } - const refMetadata = reference.metadata as DocumentMetadata; const normalizedVersion = version.toLowerCase(); const result = this.statements.getPrecedingSiblings.all( library.toLowerCase(), normalizedVersion, - refMetadata.url, + reference.url, BigInt(id), - JSON.stringify(refMetadata.path), + JSON.stringify(reference.metadata.path), limit, - ) as Array; + ) as Array; - return result.reverse().map((row) => mapDbDocumentToDocument(row)); + return this.parseMetadataArray(result).reverse(); } catch (error) { throw new ConnectionError( `Failed to find preceding sibling chunks for ID ${id}`, @@ -1471,26 +1432,25 @@ export class DocumentStore { version: string, id: string, limit: number, - ): Promise { + ): Promise { try { const reference = await this.getById(id); if (!reference) { return []; } - const refMetadata = reference.metadata; const normalizedVersion = version.toLowerCase(); const result = this.statements.getSubsequentSiblings.all( library.toLowerCase(), normalizedVersion, - refMetadata.url, + reference.url, BigInt(id), - JSON.stringify(refMetadata.path), + JSON.stringify(reference.metadata.path), limit, - ) as Array; + ) as Array; - return result.map((row) => mapDbDocumentToDocument(row)); + return this.parseMetadataArray(result); } catch (error) { throw new ConnectionError( `Failed to find subsequent sibling chunks for ID ${id}`, @@ -1506,15 +1466,14 @@ export class DocumentStore { library: string, version: string, id: string, - ): Promise { + ): Promise { try { const child = await this.getById(id); if (!child) { return null; } - const childMetadata = child.metadata as DocumentMetadata; - const path = childMetadata.path ?? []; + const path = child.metadata.path ?? 
[]; const parentPath = path.slice(0, -1); if (parentPath.length === 0) { @@ -1525,16 +1484,16 @@ export class DocumentStore { const result = this.statements.getParentChunk.get( library.toLowerCase(), normalizedVersion, - childMetadata.url, + child.url, JSON.stringify(parentPath), BigInt(id), - ) as DbQueryResult; + ) as DbQueryResult; if (!result) { return null; } - return mapDbDocumentToDocument(result); + return this.parseMetadata(result); } catch (error) { throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error); } @@ -1542,20 +1501,20 @@ export class DocumentStore { /** * Fetches multiple documents by their IDs in a single call. - * Returns an array of Document objects, sorted by their sort_order. + * Returns an array of DbPageChunk objects, sorted by their sort_order. */ async findChunksByIds( library: string, version: string, ids: string[], - ): Promise { + ): Promise { if (!ids.length) return []; try { const normalizedVersion = version.toLowerCase(); // Use parameterized query for variable number of IDs const placeholders = ids.map(() => "?").join(","); const stmt = this.db.prepare( - `SELECT d.*, p.url, p.title, p.content_type FROM documents d + `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -1568,8 +1527,8 @@ export class DocumentStore { library.toLowerCase(), normalizedVersion, ...ids, - ) as DbJoinedDocument[]; - return rows.map((row) => mapDbDocumentToDocument(row)); + ) as DbPageChunk[]; + return this.parseMetadataArray(rows); } catch (error) { throw new ConnectionError("Failed to fetch documents by IDs", error); } @@ -1577,17 +1536,17 @@ export class DocumentStore { /** * Fetches all document chunks for a specific URL within a library and version. - * Returns documents sorted by their sort_order for proper reassembly. + * Returns DbPageChunk objects sorted by their sort_order for proper reassembly. 
*/ async findChunksByUrl( library: string, version: string, url: string, - ): Promise { + ): Promise { try { const normalizedVersion = version.toLowerCase(); const stmt = this.db.prepare( - `SELECT d.*, p.url, p.title, p.content_type FROM documents d + `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -1600,8 +1559,8 @@ export class DocumentStore { library.toLowerCase(), normalizedVersion, url, - ) as DbJoinedDocument[]; - return rows.map((row) => mapDbDocumentToDocument(row)); + ) as DbPageChunk[]; + return this.parseMetadataArray(rows); } catch (error) { throw new ConnectionError(`Failed to fetch documents by URL ${url}`, error); } diff --git a/src/store/assembly/ContentAssemblyStrategyFactory.ts b/src/store/assembly/ContentAssemblyStrategyFactory.ts index 5235a1ee..c49f88c2 100644 --- a/src/store/assembly/ContentAssemblyStrategyFactory.ts +++ b/src/store/assembly/ContentAssemblyStrategyFactory.ts @@ -9,7 +9,7 @@ import type { ContentAssemblyStrategy } from "./types"; * @returns The appropriate strategy instance */ export function createContentAssemblyStrategy( - mimeType?: string, + mimeType?: string | null, ): ContentAssemblyStrategy { // Default to MarkdownAssemblyStrategy for unknown or missing MIME types if (!mimeType) { diff --git a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.test.ts b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.test.ts index fdbd957a..a76c3678 100644 --- a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.test.ts +++ b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.test.ts @@ -1,6 +1,6 @@ -import type { Document } from "@langchain/core/documents"; import { afterEach, beforeEach, describe, expect, it } from "vitest"; import { DocumentStore } from "../../DocumentStore"; +import type { DbChunkMetadata, DbPageChunk } from "../../types"; import { HierarchicalAssemblyStrategy } from "./HierarchicalAssemblyStrategy"; describe("HierarchicalAssemblyStrategy", () => { @@ -46,72 +46,56 @@ describe("HierarchicalAssemblyStrategy", () => { }); it("should reconstruct complete hierarchy for single match", async () => { - const versionId = await documentStore.resolveVersionId("test-hierarchy", "1.0"); - - expect(versionId).toBeGreaterThan(0); + // Use the public API to add documents + await documentStore.addDocuments("test-hierarchy", "1.0", 0, { + url: "Deep.ts", + title: "Deep TypeScript File", + contentType: "text/typescript", + textContent: "", + chunks: [ + { + content: "namespace UserManagement {", + section: { + path: ["UserManagement"], + level: 0, + }, + types: ["structural"], + }, + { + content: " export class UserService {", + section: { + path: ["UserManagement", "UserService"], + level: 1, + }, + types: ["structural"], + }, + { + content: " getUserById(id: string) { return db.find(id); }", + section: { + path: ["UserManagement", "UserService", "getUserById"], + level: 2, + }, + types: ["text"], + }, + ], + links: [], + errors: [], + }); - // Create a page first - const pageResult = (documentStore as any).statements.insertPage.run( - versionId, + // Query the database to get the actual document IDs + const allChunks = await documentStore.findChunksByUrl( + "test-hierarchy", + "1.0", "Deep.ts", - "Deep TypeScript File", - null, - null, - "text/typescript", ); - const pageId = pageResult.lastInsertRowid; 
+ expect(allChunks.length).toBe(3); - // Create a hierarchy: namespace > class > method - const namespaceResult = (documentStore as any).statements.insertDocument.run( - pageId, - "namespace UserManagement {", - JSON.stringify({ - url: "Deep.ts", - path: ["UserManagement"], - level: 0, - types: ["structural"], - }), - 0, - ); - const namespaceId = namespaceResult.lastInsertRowid; - - const classResult = (documentStore as any).statements.insertDocument.run( - pageId, - " export class UserService {", - JSON.stringify({ - url: "Deep.ts", - path: ["UserManagement", "UserService"], - level: 1, - types: ["structural"], - }), - 1, - ); - const classId = classResult.lastInsertRowid; - - const methodResult = (documentStore as any).statements.insertDocument.run( - pageId, - " getUserById(id: string) { return db.find(id); }", - JSON.stringify({ - url: "Deep.ts", - path: ["UserManagement", "UserService", "getUserById"], - level: 2, - types: ["content"], - }), - 2, - ); - const methodId = methodResult.lastInsertRowid; + const namespaceId = allChunks[0].id; + const classId = allChunks[1].id; + const methodId = allChunks[2].id; // Input: just the deeply nested method - const inputDoc: Document = { - id: methodId, - pageContent: " getUserById(id: string) { return db.find(id); }", - metadata: { - url: "Deep.ts", - path: ["UserManagement", "UserService", "getUserById"], - level: 2, - types: ["content"], - }, - }; + const inputDoc = allChunks[2]; const result = await strategy.selectChunks( "test-hierarchy", @@ -120,7 +104,7 @@ describe("HierarchicalAssemblyStrategy", () => { documentStore, ); - const resultContent = result.map((doc) => doc.pageContent); + const resultContent = result.map((doc) => doc.content); const resultIds = result.map((doc) => doc.id); // Should include the complete hierarchy: method + class + namespace @@ -138,62 +122,49 @@ describe("HierarchicalAssemblyStrategy", () => { }); it("should handle hierarchical gaps in parent chain", async () => { - const versionId = await documentStore.resolveVersionId("test-gaps", "1.0"); - - expect(versionId).toBeGreaterThan(0); + // Use the public API to add documents with a gap in the hierarchy + await documentStore.addDocuments("test-gaps", "1.0", 0, { + url: "GapTest.ts", + title: "Gap Test TypeScript File", + contentType: "text/typescript", + textContent: "", + chunks: [ + { + content: "namespace UserManagement {", + section: { + path: ["UserManagement"], + level: 0, + }, + types: ["structural"], + }, + // Intermediate class is missing (gap in hierarchy) + // No chunk with path: ["UserManagement", "UserService"] + { + content: " getUserById(id: string) { return db.find(id); }", + section: { + path: ["UserManagement", "UserService", "getUserById"], + level: 2, + }, + types: ["text"], + }, + ], + links: [], + errors: [], + }); - // Create a page first - const pageResult = (documentStore as any).statements.insertPage.run( - versionId, + // Query the database to get the actual document IDs + const allChunks = await documentStore.findChunksByUrl( + "test-gaps", + "1.0", "GapTest.ts", - "Gap Test TypeScript File", - null, - null, - "text/typescript", ); - const pageId = pageResult.lastInsertRowid; + expect(allChunks.length).toBe(2); - // Root namespace - exists - const namespaceResult = (documentStore as any).statements.insertDocument.run( - pageId, - "namespace UserManagement {", - JSON.stringify({ - url: "GapTest.ts", - path: ["UserManagement"], - level: 0, - types: ["structural"], - }), - 0, - ); - const namespaceId = namespaceResult.lastInsertRowid; + 
const namespaceId = allChunks[0].id; + const methodId = allChunks[1].id; - // Intermediate class - missing (gap in hierarchy) - // No chunk with path: ["UserManagement", "UserService"] - - // Deep method with missing intermediate parent - const methodResult = (documentStore as any).statements.insertDocument.run( - pageId, - " getUserById(id: string) { return db.find(id); }", - JSON.stringify({ - url: "GapTest.ts", - path: ["UserManagement", "UserService", "getUserById"], - level: 2, - types: ["content"], - }), - 1, - ); - const methodId = methodResult.lastInsertRowid; - - const inputDoc: Document = { - id: methodId, - pageContent: " getUserById(id: string) { return db.find(id); }", - metadata: { - url: "GapTest.ts", - path: ["UserManagement", "UserService", "getUserById"], - level: 2, - types: ["content"], - }, - }; + // Input: just the deeply nested method (with missing intermediate parent) + const inputDoc = allChunks[1]; const result = await strategy.selectChunks( "test-gaps", @@ -202,7 +173,7 @@ describe("HierarchicalAssemblyStrategy", () => { documentStore, ); - const resultContent = result.map((doc) => doc.pageContent); + const resultContent = result.map((doc) => doc.content); const resultIds = result.map((doc) => doc.id); // Should include the matched method and find the root namespace despite the gap @@ -216,61 +187,48 @@ describe("HierarchicalAssemblyStrategy", () => { }); it("should promote deeply nested anonymous functions to their top-level container", async () => { - const versionId = await documentStore.resolveVersionId("test-promotion", "1.0"); - - expect(versionId).toBeGreaterThan(0); + // Use the public API to add documents with nested anonymous function + await documentStore.addDocuments("test-promotion", "1.0", 0, { + url: "applyMigrations.ts", + title: "Apply Migrations TypeScript File", + contentType: "text/typescript", + textContent: "", + chunks: [ + { + content: + "export async function applyMigrations(db: Database): Promise {\n const overallTransaction = db.transaction(() => {\n console.log('migrating');\n });\n}", + section: { + path: ["applyMigrations"], + level: 1, + }, + types: ["code"], + }, + { + content: " console.log('migrating');", + section: { + path: ["applyMigrations", ""], + level: 2, + }, + types: ["code"], + }, + ], + links: [], + errors: [], + }); - // Create a page first - const pageResult = (documentStore as any).statements.insertPage.run( - versionId, + // Query the database to get the actual document IDs + const allChunks = await documentStore.findChunksByUrl( + "test-promotion", + "1.0", "applyMigrations.ts", - "Apply Migrations TypeScript File", - null, - null, - "text/typescript", ); - const pageId = pageResult.lastInsertRowid; + expect(allChunks.length).toBe(2); - // Create a simpler, more realistic scenario that matches how the splitter actually works - // Function containing nested arrow function - const topFunctionResult = (documentStore as any).statements.insertDocument.run( - pageId, - "export async function applyMigrations(db: Database): Promise {\n const overallTransaction = db.transaction(() => {\n console.log('migrating');\n });\n}", - JSON.stringify({ - url: "applyMigrations.ts", - path: ["applyMigrations"], - level: 1, - types: ["code", "content"], - }), - 0, - ); - const topFunctionId = topFunctionResult.lastInsertRowid; - - // Nested arrow function inside the main function - const nestedArrowResult = (documentStore as any).statements.insertDocument.run( - pageId, - " console.log('migrating');", - JSON.stringify({ - url: 
"applyMigrations.ts", - path: ["applyMigrations", ""], - level: 2, - types: ["code", "content"], - }), - 1, - ); - const nestedArrowId = nestedArrowResult.lastInsertRowid; + const topFunctionId = allChunks[0].id; + const nestedArrowId = allChunks[1].id; // Input: search hit on the nested anonymous arrow function - const inputDoc: Document = { - id: nestedArrowId, - pageContent: " console.log('migrating');", - metadata: { - url: "applyMigrations.ts", - path: ["applyMigrations", ""], - level: 2, - types: ["code", "content"], - }, - }; + const inputDoc = allChunks[1]; const result = await strategy.selectChunks( "test-promotion", @@ -279,7 +237,7 @@ describe("HierarchicalAssemblyStrategy", () => { documentStore, ); - const _resultContent = result.map((doc) => doc.pageContent); + const _resultContent = result.map((doc) => doc.content); const resultIds = result.map((doc) => doc.id); // Should promote to include the entire top-level function that contains the anonymous function @@ -297,95 +255,111 @@ describe("HierarchicalAssemblyStrategy", () => { expect(versionId).toBeGreaterThan(0); // Create a page first - const pageResult = (documentStore as any).statements.insertPage.run( + // @ts-expect-error Accessing private property for testing + const pageResult = documentStore.statements.insertPage.run( versionId, "UserService.ts", "User Service TypeScript File", null, null, "text/typescript", + 0, // depth ); - const pageId = pageResult.lastInsertRowid; + const pageId = pageResult.lastInsertRowid as number; // Class with multiple methods - only some will be matched - const _classOpenResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const _classOpenResult = documentStore.statements.insertDocument.run( pageId, "class UserService {", JSON.stringify({ - url: "UserService.ts", path: ["UserService", "opening"], level: 1, - }), + } satisfies DbChunkMetadata), 0, ); // Method 1: getUser (will be matched) - const getUserResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const getUserResult = documentStore.statements.insertDocument.run( pageId, " getUser(id) { return db.find(id); }", JSON.stringify({ - url: "UserService.ts", path: ["UserService", "opening", "getUser"], level: 2, - }), + } satisfies DbChunkMetadata), 1, ); const getUserId = getUserResult.lastInsertRowid.toString(); // Method 2: createUser (will NOT be matched) - (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + documentStore.statements.insertDocument.run( pageId, " createUser(data) { return db.create(data); }", JSON.stringify({ - url: "UserService.ts", path: ["UserService", "opening", "createUser"], level: 2, - }), + } satisfies DbChunkMetadata), 2, ); // Method 3: deleteUser (will be matched) - const deleteUserResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const deleteUserResult = documentStore.statements.insertDocument.run( pageId, " deleteUser(id) { return db.delete(id); }", JSON.stringify({ - url: "UserService.ts", path: ["UserService", "opening", "deleteUser"], level: 2, - }), + } satisfies DbChunkMetadata), 3, ); const deleteUserId = deleteUserResult.lastInsertRowid.toString(); - const inputDocs: Document[] = [ + const inputDocs: DbPageChunk[] = [ { id: getUserId, - pageContent: " getUser(id) { return db.find(id); }", + page_id: pageId, + url: 
"UserService.ts", + title: "User Service TypeScript File", + content_type: "text/typescript", + content: " getUser(id) { return db.find(id); }", metadata: { - url: "UserService.ts", path: ["UserService", "getUser"], level: 2, }, + sort_order: 1, + embedding: null, + created_at: new Date().toISOString(), + score: null, }, { id: deleteUserId, - pageContent: " deleteUser(id) { return db.delete(id); }", + page_id: pageId, + url: "UserService.ts", + title: "User Service TypeScript File", + content_type: "text/typescript", + content: " deleteUser(id) { return db.delete(id); }", metadata: { - url: "UserService.ts", path: ["UserService", "deleteUser"], level: 2, }, + sort_order: 3, + embedding: null, + created_at: new Date().toISOString(), + score: null, }, ]; const result = await strategy.selectChunks( "test-multi", "1.0", - inputDocs, + inputDocs as DbPageChunk[], documentStore, ); - const content = result.map((doc) => doc.pageContent); + const content = result.map((doc) => doc.content); // Should include both matched methods expect(content).toContain(" getUser(id) { return db.find(id); }"); @@ -401,70 +375,88 @@ describe("HierarchicalAssemblyStrategy", () => { expect(versionId).toBeGreaterThan(0); // Create pages first - const pageAResult = (documentStore as any).statements.insertPage.run( + // @ts-expect-error Accessing private property for testing + const pageAResult = documentStore.statements.insertPage.run( versionId, "FileA.ts", "File A TypeScript File", null, null, "text/typescript", + 0, // depth ); - const pageAId = pageAResult.lastInsertRowid; + const pageAId = pageAResult.lastInsertRowid as number; - const pageBResult = (documentStore as any).statements.insertPage.run( + // @ts-expect-error Accessing private property for testing + const pageBResult = documentStore.statements.insertPage.run( versionId, "FileB.ts", "File B TypeScript File", null, null, "text/typescript", + 0, // depth ); - const pageBId = pageBResult.lastInsertRowid; + const pageBId = pageBResult.lastInsertRowid as number; // File A - const methodAResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const methodAResult = documentStore.statements.insertDocument.run( pageAId, " methodAlpha() { return 'Alpha'; }", JSON.stringify({ - url: "FileA.ts", path: ["FileA", "methodAlpha"], level: 2, - }), + } satisfies DbChunkMetadata), 0, ); const methodAId = methodAResult.lastInsertRowid.toString(); // File B - const methodBResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const methodBResult = documentStore.statements.insertDocument.run( pageBId, " methodBeta() { return 'Beta'; }", JSON.stringify({ - url: "FileB.ts", path: ["FileB", "methodBeta"], level: 2, - }), + } satisfies DbChunkMetadata), 0, ); const methodBId = methodBResult.lastInsertRowid.toString(); - const inputDocs: Document[] = [ + const inputDocs: DbPageChunk[] = [ { id: methodAId, - pageContent: " methodAlpha() { return 'Alpha'; }", + page_id: pageAId, + url: "FileA.ts", + title: "File A TypeScript File", + content_type: "text/typescript", + content: " methodAlpha() { return 'Alpha'; }", metadata: { - url: "FileA.ts", path: ["FileA", "methodAlpha"], level: 2, }, + sort_order: 0, + embedding: null, + created_at: new Date().toISOString(), + score: null, }, { id: methodBId, - pageContent: " methodBeta() { return 'Beta'; }", + page_id: pageBId, + url: "FileB.ts", + title: "File B TypeScript File", + content_type: 
"text/typescript", + content: " methodBeta() { return 'Beta'; }", metadata: { - url: "FileB.ts", path: ["FileB", "methodBeta"], level: 2, }, + sort_order: 0, + embedding: null, + created_at: new Date().toISOString(), + score: null, }, ]; @@ -475,7 +467,7 @@ describe("HierarchicalAssemblyStrategy", () => { documentStore, ); - const content = result.map((d) => d.pageContent); + const content = result.map((d) => d.content); expect(content).toContain(" methodAlpha() { return 'Alpha'; }"); expect(content).toContain(" methodBeta() { return 'Beta'; }"); }); @@ -483,22 +475,22 @@ describe("HierarchicalAssemblyStrategy", () => { describe("assembleContent", () => { it("should concatenate chunks in document order", () => { - const chunks: Document[] = [ + const chunks: DbPageChunk[] = [ { id: "1", - pageContent: "class UserService {", + content: "class UserService {", metadata: {}, - }, + } as DbPageChunk, { id: "2", - pageContent: " getUser() { return 'user'; }", + content: " getUser() { return 'user'; }", metadata: {}, - }, + } as DbPageChunk, { id: "3", - pageContent: "}", + content: "}", metadata: {}, - }, + } as DbPageChunk, ]; const result = strategy.assembleContent(chunks); @@ -511,17 +503,17 @@ describe("HierarchicalAssemblyStrategy", () => { }); it("should provide debug output when requested", () => { - const chunks: Document[] = [ + const chunks: DbPageChunk[] = [ { id: "1", - pageContent: "function test() {", + content: "function test() {", metadata: { path: ["test"], level: 0 }, - }, + } as DbPageChunk, { id: "2", - pageContent: " return 42;", + content: " return 42;", metadata: { path: ["test", "return"], level: 1 }, - }, + } as DbPageChunk, ]; const result = strategy.assembleContent(chunks, true); diff --git a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts index a4d1f5de..0407361d 100644 --- a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts +++ b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts @@ -1,7 +1,7 @@ -import type { Document } from "@langchain/core/documents"; import { logger } from "../../../utils/logger"; import { MimeTypeUtils } from "../../../utils/mimeTypeUtils"; import type { DocumentStore } from "../../DocumentStore"; +import type { DbPageChunk } from "../../types"; import type { ContentAssemblyStrategy } from "../types"; /** @@ -44,18 +44,18 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { async selectChunks( library: string, version: string, - initialChunks: Document[], + initialChunks: DbPageChunk[], documentStore: DocumentStore, - ): Promise { + ): Promise { if (initialChunks.length === 0) { return []; } try { // Group chunks by document URL - const chunksByDocument = new Map(); + const chunksByDocument = new Map(); for (const chunk of initialChunks) { - const url = chunk.metadata.url as string; + const url = chunk.url; if (!chunksByDocument.has(url)) { chunksByDocument.set(url, []); } @@ -111,7 +111,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { } // IMPORTANT: Always include the original matched chunk first - allChunkIds.add(matched.id as string); + allChunkIds.add(matched.id); // Use promoted ancestor (may still be the original matched chunk if promotion not applicable) const ancestorParentChain = await this.walkToRoot( @@ -138,7 +138,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { // Multiple matches: use selective subtree reassembly // IMPORTANT: Always 
include all original matched chunks first for (const matched of documentChunks) { - allChunkIds.add(matched.id as string); + allChunkIds.add(matched.id); } const subtreeIds = await this.selectSubtreeChunks( @@ -171,18 +171,18 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { * Assembles chunks using simple concatenation. * Relies on splitter concatenation guarantees - chunks are designed to join seamlessly. */ - assembleContent(chunks: Document[], debug = false): string { + assembleContent(chunks: DbPageChunk[], debug = false): string { if (debug) { return chunks .map( (chunk) => `=== #${chunk.id} ${chunk.metadata.path?.join("/")} [${chunk.metadata.level}] ===\n` + - chunk.pageContent, + chunk.content, ) .join(""); } // Production/default: simple concatenation leveraging splitter guarantees. - return chunks.map((chunk) => chunk.pageContent).join(""); + return chunks.map((chunk) => chunk.content).join(""); } /** @@ -197,18 +197,18 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async walkToRoot( library: string, version: string, - chunk: Document, + chunk: DbPageChunk, documentStore: DocumentStore, ): Promise { const chainIds: string[] = []; const visited = new Set(); - let currentChunk: Document | null = chunk; + let currentChunk: DbPageChunk | null = chunk; const maxDepth = 50; // Safety limit to prevent runaway loops let depth = 0; // Walk up parent chain until we reach the root while (currentChunk && depth < maxDepth) { - const currentId = currentChunk.id as string; + const currentId = currentChunk.id; // Check for circular references if (visited.has(currentId)) { @@ -235,22 +235,20 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { currentChunk = await this.findAncestorWithGaps( library, version, - currentChunk.metadata as { url: string; path?: string[] }, + currentChunk.url, + currentChunk.metadata.path ?? [], documentStore, ); } } catch (error) { // If standard lookup fails, try gap-aware ancestor search try { - const currentMetadata = currentChunk?.metadata as { - url: string; - path?: string[]; - }; - if (currentMetadata) { + if (currentChunk) { currentChunk = await this.findAncestorWithGaps( library, version, - currentMetadata, + currentChunk.url, + currentChunk.metadata.path ?? 
[], documentStore, ); } else { @@ -281,12 +279,10 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async findAncestorWithGaps( library: string, version: string, - metadata: { url: string; path?: string[] }, + url: string, + path: string[], documentStore: DocumentStore, - ): Promise { - const path = metadata.path || []; - const url = metadata.url; - + ): Promise { if (path.length <= 1) { return null; // Already at or near root } @@ -331,7 +327,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { url: string, targetPath: string[], documentStore: DocumentStore, - ): Promise { + ): Promise { try { // Get all chunks from the same document URL const allChunks = await documentStore.findChunksByUrl(library, version, url); @@ -342,7 +338,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { const matchingChunks = allChunks.filter((chunk) => { const chunkPath = (chunk.metadata.path as string[]) || []; - const chunkUrl = chunk.metadata.url as string; + const chunkUrl = chunk.url; // Must be in the same document if (chunkUrl !== url) return false; @@ -368,13 +364,13 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async findStructuralAncestor( library: string, version: string, - chunk: Document, + chunk: DbPageChunk, documentStore: DocumentStore, - ): Promise { - let current: Document | null = chunk; + ): Promise { + let current: DbPageChunk | null = chunk; // If current is structural already, return it - const isStructural = (c: Document | null) => + const isStructural = (c: DbPageChunk | null) => !!c && Array.isArray(c.metadata?.types) && c.metadata.types.includes("structural"); if (isStructural(current)) { @@ -383,11 +379,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { // Walk up until we find a structural ancestor while (true) { - const parent = await documentStore.findParentChunk( - library, - version, - current.id as string, - ); + const parent = await documentStore.findParentChunk(library, version, current.id); if (!parent) { return null; } @@ -405,7 +397,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async selectSubtreeChunks( library: string, version: string, - documentChunks: Document[], + documentChunks: DbPageChunk[], documentStore: DocumentStore, ): Promise { const chunkIds = new Set(); @@ -458,7 +450,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { /** * Finds the common ancestor path from a list of chunks by finding the longest common prefix. */ - private findCommonAncestorPath(chunks: Document[]): string[] { + private findCommonAncestorPath(chunks: DbPageChunk[]): string[] { if (chunks.length === 0) return []; if (chunks.length === 1) return (chunks[0].metadata.path as string[]) ?? 
[]; @@ -488,7 +480,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async findContainerChunks( library: string, version: string, - referenceChunk: Document, + referenceChunk: DbPageChunk, ancestorPath: string[], documentStore: DocumentStore, ): Promise { @@ -500,13 +492,13 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { const ancestorChunks = await this.findChunksByExactPath( library, version, - referenceChunk.metadata.url as string, + referenceChunk.url, ancestorPath, documentStore, ); for (const chunk of ancestorChunks) { - containerIds.push(chunk.id as string); + containerIds.push(chunk.id); } } catch (error) { logger.warn( @@ -527,7 +519,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { url: string, path: string[], documentStore: DocumentStore, - ): Promise { + ): Promise { try { // For root path, return empty - no specific chunks to find if (path.length === 0) { @@ -569,17 +561,17 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async findSubtreeChunks( library: string, version: string, - rootChunk: Document, + rootChunk: DbPageChunk, documentStore: DocumentStore, ): Promise { const subtreeIds: string[] = []; const visited = new Set(); - const queue: Document[] = [rootChunk]; + const queue: DbPageChunk[] = [rootChunk]; while (queue.length > 0) { // biome-ignore lint/style/noNonNullAssertion: this is safe due to the while condition const currentChunk = queue.shift()!; - const currentId = currentChunk.id as string; + const currentId = currentChunk.id; if (visited.has(currentId)) continue; visited.add(currentId); @@ -609,21 +601,21 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async fallbackSelection( library: string, version: string, - initialChunks: Document[], + initialChunks: DbPageChunk[], documentStore: DocumentStore, - ): Promise { + ): Promise { const chunkIds = new Set(); // Just include the initial chunks and their immediate parents/children for (const chunk of initialChunks) { - const id = chunk.id as string; + const id = chunk.id; chunkIds.add(id); // Add parent for context try { const parent = await documentStore.findParentChunk(library, version, id); if (parent) { - chunkIds.add(parent.id as string); + chunkIds.add(parent.id); } } catch (error) { logger.warn(`Failed to find parent for chunk ${id}: ${error}`); diff --git a/src/store/assembly/strategies/MarkdownAssemblyStrategy.test.ts b/src/store/assembly/strategies/MarkdownAssemblyStrategy.test.ts index f22e1613..ef56e38b 100644 --- a/src/store/assembly/strategies/MarkdownAssemblyStrategy.test.ts +++ b/src/store/assembly/strategies/MarkdownAssemblyStrategy.test.ts @@ -1,6 +1,6 @@ -import { Document } from "@langchain/core/documents"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { DocumentStore } from "../../DocumentStore"; +import type { DbPageChunk } from "../../types"; import { MarkdownAssemblyStrategy } from "./MarkdownAssemblyStrategy"; // Mock DocumentStore with just the methods we need @@ -17,88 +17,96 @@ const createMockDocumentStore = () => const createDocumentUniverse = () => { return { // Target chunk (the one we're finding relations for) - target: new Document({ + target: { id: "target", - pageContent: "Target content", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.1"] }, - }), + content: "Target content", + url: "https://example.com", + metadata: { path: ["Chapter 
1", "Section 1.1"] }, + } as DbPageChunk, // Parent - parent: new Document({ + parent: { id: "parent", - pageContent: "Parent section content", - metadata: { url: "https://example.com", path: ["Chapter 1"] }, - }), + content: "Parent section content", + url: "https://example.com", + metadata: { path: ["Chapter 1"] }, + } as DbPageChunk, // Children (limit = 3, so child4 should be excluded) - child1: new Document({ + child1: { id: "child1", - pageContent: "First child content", + content: "First child content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1", "Subsection A"], }, - }), - child2: new Document({ + } as DbPageChunk, + child2: { id: "child2", - pageContent: "Second child content", + content: "Second child content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1", "Subsection B"], }, - }), - child3: new Document({ + } as DbPageChunk, + child3: { id: "child3", - pageContent: "Third child content", + content: "Third child content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1", "Subsection C"], }, - }), - child4: new Document({ + } as DbPageChunk, + child4: { id: "child4", - pageContent: "Fourth child content (should be excluded)", + content: "Fourth child content (should be excluded)", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1", "Subsection D"], }, - }), + } as DbPageChunk, // Preceding siblings (limit = 1, so only prev1 should be included) - prev1: new Document({ + prev1: { id: "prev1", - pageContent: "Previous sibling 1", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.0"] }, - }), - prev2: new Document({ + content: "Previous sibling 1", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.0"] }, + } as DbPageChunk, + prev2: { id: "prev2", - pageContent: "Previous sibling 2 (should be excluded)", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 0.9"] }, - }), + content: "Previous sibling 2 (should be excluded)", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 0.9"] }, + } as DbPageChunk, // Subsequent siblings (limit = 2) - next1: new Document({ + next1: { id: "next1", - pageContent: "Next sibling 1", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.2"] }, - }), - next2: new Document({ + content: "Next sibling 1", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.2"] }, + } as DbPageChunk, + next2: { id: "next2", - pageContent: "Next sibling 2", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.3"] }, - }), - next3: new Document({ + content: "Next sibling 2", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.3"] }, + } as DbPageChunk, + next3: { id: "next3", - pageContent: "Next sibling 3 (should be excluded)", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.4"] }, - }), + content: "Next sibling 3 (should be excluded)", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.4"] }, + } as DbPageChunk, // Orphan chunk (no relations) - orphan: new Document({ + orphan: { id: "orphan", - pageContent: "Orphan content", - metadata: { url: "https://example.com/other", path: ["Standalone"] }, - }), + content: "Orphan content", + url: "https://example.com/other", + metadata: { path: ["Standalone"] }, + } as DbPageChunk, }; 
}; @@ -206,11 +214,11 @@ describe("MarkdownAssemblyStrategy", () => { }); it("handles chunks with existing newlines", () => { - const chunkWithNewlines = new Document({ + const chunkWithNewlines = { id: "newlines", - pageContent: "Line 1\nLine 2\n\nLine 4", + content: "Line 1\nLine 2\n\nLine 4", metadata: {}, - }); + } as DbPageChunk; const result = strategy.assembleContent([universe.target, chunkWithNewlines]); expect(result).toBe("Target content\n\nLine 1\nLine 2\n\nLine 4"); @@ -575,10 +583,10 @@ describe("MarkdownAssemblyStrategy", () => { }); it("handles chunks without IDs gracefully", async () => { - const invalidChunk = new Document({ - pageContent: "No ID chunk", + const invalidChunk = { + content: "No ID chunk", metadata: {}, - }); + } as DbPageChunk; // Mock all store methods to return empty arrays for undefined IDs vi.mocked(mockStore.findParentChunk).mockResolvedValue(null); diff --git a/src/store/assembly/strategies/MarkdownAssemblyStrategy.ts b/src/store/assembly/strategies/MarkdownAssemblyStrategy.ts index 9a4f61c3..8c391370 100644 --- a/src/store/assembly/strategies/MarkdownAssemblyStrategy.ts +++ b/src/store/assembly/strategies/MarkdownAssemblyStrategy.ts @@ -1,6 +1,6 @@ -import type { Document } from "@langchain/core/documents"; import { MimeTypeUtils } from "../../../utils/mimeTypeUtils"; import type { DocumentStore } from "../../DocumentStore"; +import type { DbPageChunk } from "../../types"; import type { ContentAssemblyStrategy } from "../types"; const CHILD_LIMIT = 3; @@ -53,9 +53,9 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { async selectChunks( library: string, version: string, - initialChunks: Document[], + initialChunks: DbPageChunk[], documentStore: DocumentStore, - ): Promise { + ): Promise { const allChunkIds = new Set(); // Process all initial chunks in parallel to gather related chunk IDs @@ -82,8 +82,8 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { /** * Assembles chunks using simple "\n\n" joining (current behavior). 
*/ - assembleContent(chunks: Document[]): string { - return chunks.map((chunk) => chunk.pageContent).join("\n\n"); + assembleContent(chunks: DbPageChunk[]): string { + return chunks.map((chunk) => chunk.content).join("\n\n"); } /** @@ -93,10 +93,10 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { private async getRelatedChunkIds( library: string, version: string, - doc: Document, + doc: DbPageChunk, documentStore: DocumentStore, ): Promise> { - const id = doc.id as string; + const id = doc.id; const relatedIds = new Set(); // Add the original chunk @@ -105,7 +105,7 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { // Parent const parent = await documentStore.findParentChunk(library, version, id); if (parent) { - relatedIds.add(parent.id as string); + relatedIds.add(parent.id); } // Preceding Siblings @@ -116,7 +116,7 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { PRECEDING_SIBLINGS_LIMIT, ); for (const sib of precedingSiblings) { - relatedIds.add(sib.id as string); + relatedIds.add(sib.id); } // Child Chunks @@ -127,7 +127,7 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { CHILD_LIMIT, ); for (const child of childChunks) { - relatedIds.add(child.id as string); + relatedIds.add(child.id); } // Subsequent Siblings @@ -138,7 +138,7 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { SUBSEQUENT_SIBLINGS_LIMIT, ); for (const sib of subsequentSiblings) { - relatedIds.add(sib.id as string); + relatedIds.add(sib.id); } return relatedIds; diff --git a/src/store/assembly/types.ts b/src/store/assembly/types.ts index bf926f17..a0c2a8ac 100644 --- a/src/store/assembly/types.ts +++ b/src/store/assembly/types.ts @@ -1,5 +1,5 @@ -import type { Document } from "@langchain/core/documents"; import type { DocumentStore } from "../DocumentStore"; +import type { DbPageChunk } from "../types"; /** * Strategy interface for content-type-aware search result assembly. @@ -28,9 +28,9 @@ export interface ContentAssemblyStrategy { selectChunks( library: string, version: string, - initialChunks: Document[], + initialChunks: DbPageChunk[], documentStore: DocumentStore, - ): Promise; + ): Promise; /** * Assembles the selected chunks into final content. 
@@ -38,7 +38,7 @@ export interface ContentAssemblyStrategy { * @param chunks The chunks to assemble (already in proper order) * @returns The assembled content string */ - assembleContent(chunks: Document[]): string; + assembleContent(chunks: DbPageChunk[]): string; } /** @@ -46,7 +46,7 @@ export interface ContentAssemblyStrategy { */ export interface ContentAssemblyContext { /** The chunks that matched the search query */ - initialChunks: Document[]; + initialChunks: DbPageChunk[]; /** MIME type of the content (from first chunk metadata) */ mimeType?: string; /** Document URL for grouping */ @@ -60,7 +60,7 @@ export interface ContentAssemblyContext { */ export interface ChunkSelectionResult { /** Selected chunks in proper order for assembly */ - chunks: Document[]; + chunks: DbPageChunk[]; /** Strategy that was used for selection */ strategy: string; /** Any warnings or notes about the selection process */ diff --git a/src/store/types.ts b/src/store/types.ts index 358d1622..6a3b59ee 100644 --- a/src/store/types.ts +++ b/src/store/types.ts @@ -1,5 +1,4 @@ import type { ScrapeMode } from "../scraper/types"; -import type { DocumentMetadata } from "../types"; /** Default vector dimension used across the application */ export const VECTOR_DIMENSION = 1536; @@ -15,18 +14,33 @@ export interface DbPage { etag: string | null; last_modified: string | null; content_type: string | null; + depth: number | null; created_at: string; updated_at: string; } +/** + * Chunk-level metadata stored with each document chunk. + * Contains hierarchical information about the chunk's position within the page. + */ +export interface DbChunkMetadata { + level?: number; // Hierarchical level in document + path?: string[]; // Hierarchical path in document + // TODO: Check if `types` is properly use + types?: string[]; // Types of content in this chunk (e.g., "text", "code", "table") + // FIXME: Enable additional metadata fields again once we have a clear schema for what metadata we want to store with each chunk. + // Allow for additional chunk-specific metadata + // [key: string]: unknown; +} + /** * Database document record type matching the documents table schema */ -export interface DbDocument { +export interface DbChunk { id: string; page_id: number; // Foreign key to pages table content: string; - metadata: string; // JSON string of chunk-specific metadata (level, path, etc.) + metadata: DbChunkMetadata; // Chunk-specific metadata (level, path, etc.) sort_order: number; embedding: Buffer | null; // Binary blob for embeddings created_at: string; @@ -37,36 +51,26 @@ export interface DbDocument { * Represents the result of a JOIN between the documents and pages tables. * It includes all fields from a document chunk plus the relevant page-level metadata. */ -export interface DbJoinedDocument extends DbDocument { +export interface DbPageChunk extends DbChunk { url: string; - title: string | null; - content_type: string | null; + title?: string | null; + content_type?: string | null; } /** - * Utility type for handling SQLite query results that may be undefined + * Represents the ranking information for a search result, including both + * vector and full-text search ranks. */ -export type DbQueryResult = T | undefined; +export interface DbChunkRank { + score: number; + vec_rank?: number; + fts_rank?: number; +} /** - * Maps raw database document with joined page data to the Document type used by the application. - * Now uses the explicit DbJoinedDocument type for improved type safety. 
+ * Utility type for handling SQLite query results that may be undefined */ -export function mapDbDocumentToDocument(doc: DbJoinedDocument) { - const chunkMetadata = JSON.parse(doc.metadata) as DocumentMetadata; - - return { - id: doc.id, - pageContent: doc.content, - metadata: { - ...chunkMetadata, - // Page-level fields are always available from joined queries - url: doc.url, - title: doc.title || "", // Convert null to empty string for consistency - ...(doc.content_type && { contentType: doc.content_type }), - } as DocumentMetadata, - }; -} +export type DbQueryResult = T | undefined; /** * Search result type returned by the DocumentRetrieverService @@ -75,7 +79,7 @@ export interface StoreSearchResult { url: string; content: string; score: number | null; - mimeType?: string; + mimeType?: string | null; } /** @@ -299,3 +303,20 @@ export function isActiveStatus(status: VersionStatus): boolean { status, ); } + +/** + * Library version row returned by queryLibraryVersions. + * Aggregates version metadata with document counts and indexing status. + */ +export interface DbLibraryVersion { + library: string; + version: string; + versionId: number; + status: VersionStatus; + progressPages: number; + progressMaxPages: number; + sourceUrl: string | null; + documentCount: number; + uniqueUrlCount: number; + indexedAt: string | null; +} diff --git a/src/tools/FetchUrlTool.ts b/src/tools/FetchUrlTool.ts index 5b0e3fdf..669a31b6 100644 --- a/src/tools/FetchUrlTool.ts +++ b/src/tools/FetchUrlTool.ts @@ -2,7 +2,7 @@ import type { AutoDetectFetcher, RawContent } from "../scraper/fetcher"; import { HtmlPipeline } from "../scraper/pipelines/HtmlPipeline"; import { MarkdownPipeline } from "../scraper/pipelines/MarkdownPipeline"; import { TextPipeline } from "../scraper/pipelines/TextPipeline"; -import type { ContentPipeline, ProcessedContent } from "../scraper/pipelines/types"; +import type { ContentPipeline, PipelineResult } from "../scraper/pipelines/types"; import { ScrapeMode } from "../scraper/types"; import { convertToString } from "../scraper/utils/buffer"; import { resolveCharset } from "../scraper/utils/charset"; @@ -96,9 +96,9 @@ export class FetchUrlTool { logger.info("🔄 Processing content..."); - let processed: Awaited | undefined; + let processed: Awaited | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) { processed = await pipeline.process( rawContent, { @@ -135,7 +135,7 @@ export class FetchUrlTool { return contentString; } - for (const err of processed.errors) { + for (const err of processed.errors ?? 
[]) { logger.warn(`⚠️ Processing error for ${url}: ${err.message}`); } diff --git a/src/tools/ListJobsTool.test.ts b/src/tools/ListJobsTool.test.ts index b6b8cdc0..c3ed6f45 100644 --- a/src/tools/ListJobsTool.test.ts +++ b/src/tools/ListJobsTool.test.ts @@ -1,7 +1,7 @@ import { beforeEach, describe, expect, it, type Mock, vi } from "vitest"; import type { PipelineManager } from "../pipeline/PipelineManager"; import { type PipelineJob, PipelineJobStatus } from "../pipeline/types"; -import type { ScraperProgress } from "../scraper/types"; +import type { ScraperProgressEvent } from "../scraper/types"; import { VersionStatus } from "../store/types"; import { ListJobsTool } from "./ListJobsTool"; @@ -49,7 +49,7 @@ describe("ListJobsTool", () => { currentUrl: "url2/page5", depth: 1, maxDepth: 3, - } as ScraperProgress, + } as ScraperProgressEvent, error: null, finishedAt: null, // Database fields @@ -60,7 +60,12 @@ describe("ListJobsTool", () => { errorMessage: null, updatedAt: new Date("2023-01-01T11:05:00Z"), sourceUrl: "url2", - scraperOptions: { maxDepth: 3 }, + scraperOptions: { + url: "url2", + library: "lib-b", + version: "2.0.0", + maxDepth: 3, + }, }, { id: "job-3", @@ -76,7 +81,7 @@ describe("ListJobsTool", () => { currentUrl: "url3/page10", depth: 2, maxDepth: 2, - } as ScraperProgress, + } as ScraperProgressEvent, error: null, // Database fields versionId: 3, @@ -86,7 +91,12 @@ describe("ListJobsTool", () => { errorMessage: null, updatedAt: new Date("2023-01-01T12:15:00Z"), sourceUrl: "url3", - scraperOptions: { maxDepth: 2 }, + scraperOptions: { + url: "url3", + library: "lib-a", + version: "1.1.0", + maxDepth: 2, + }, }, ]; diff --git a/src/tools/ScrapeTool.test.ts b/src/tools/ScrapeTool.test.ts index 567b3376..c04d88f9 100644 --- a/src/tools/ScrapeTool.test.ts +++ b/src/tools/ScrapeTool.test.ts @@ -21,7 +21,7 @@ describe("ScrapeTool", () => { // Mock the manager instance methods mockManagerInstance = { start: vi.fn().mockResolvedValue(undefined), - enqueueJob: vi.fn().mockResolvedValue(MOCK_JOB_ID), // Return a mock job ID + enqueueScrapeJob: vi.fn().mockResolvedValue(MOCK_JOB_ID), // Return a mock job ID waitForJobCompletion: vi.fn().mockResolvedValue(undefined), // Default success getJob: vi.fn().mockResolvedValue({ // Mock getJob for final status check @@ -64,13 +64,13 @@ describe("ScrapeTool", () => { const options = getBaseOptions(input); await scrapeTool.execute(options); - // Check enqueueJob call (implies constructor was called) + // Check enqueueScrapeJob call (implies constructor was called) const expectedVersionArg = typeof expectedInternal === "string" ? 
expectedInternal.toLowerCase() : expectedInternal; // null stays null - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledWith( + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledWith( "test-lib", expectedVersionArg, expect.objectContaining({ url: options.url }), // Check basic options passed @@ -87,7 +87,7 @@ describe("ScrapeTool", () => { await expect(scrapeTool.execute(options)).rejects.toThrow( /Invalid version format for scraping/, ); - expect(mockManagerInstance.enqueueJob).not.toHaveBeenCalled(); + expect(mockManagerInstance.enqueueScrapeJob).not.toHaveBeenCalled(); }, ); @@ -105,8 +105,8 @@ describe("ScrapeTool", () => { }; await scrapeTool.execute(options); - // Check enqueueJob options - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledWith( + // Check enqueueScrapeJob options + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledWith( "test-lib", "1.0.0", // Normalized and lowercased { @@ -148,7 +148,7 @@ describe("ScrapeTool", () => { const result = await scrapeTool.execute(options); expect(result).toEqual({ jobId: MOCK_JOB_ID }); - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledOnce(); + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledOnce(); expect(mockManagerInstance.waitForJobCompletion).not.toHaveBeenCalled(); // Should not wait }); @@ -156,7 +156,7 @@ describe("ScrapeTool", () => { const options = getBaseOptions("1.0.0"); // waitForCompletion is omitted (defaults to true) await scrapeTool.execute(options); - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledOnce(); + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledOnce(); expect(mockManagerInstance.waitForJobCompletion).toHaveBeenCalledWith(MOCK_JOB_ID); // Should wait }); @@ -166,7 +166,7 @@ describe("ScrapeTool", () => { (mockManagerInstance.waitForJobCompletion as Mock).mockRejectedValue(jobError); await expect(scrapeTool.execute(options)).rejects.toThrow("Job failed"); - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledOnce(); // Job was still enqueued + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledOnce(); // Job was still enqueued }); it("should pass custom headers to the pipeline manager", async () => { @@ -180,7 +180,7 @@ describe("ScrapeTool", () => { }, }; await scrapeTool.execute(options); - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledWith( + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledWith( "test-lib", "2.0.0", expect.objectContaining({ diff --git a/src/types/index.ts b/src/types/index.ts index 0a34eac8..8acee190 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -1,46 +1,3 @@ -/** - * Common document content type shared across modules - */ -export interface Document { - content: string; - metadata: DocumentMetadata; - contentType?: string; // MIME type of the original content -} - -/** - * Page-level metadata stored in the pages table - */ -export interface PageMetadata { - url: string; - title: string; - etag?: string; - lastModified?: string; - contentType?: string; -} - -/** - * Chunk-level metadata stored with each document chunk - */ -export interface ChunkMetadata { - level?: number; // Hierarchical level in document - path?: string[]; // Hierarchical path in document - // Allow for additional chunk-specific metadata - [key: string]: unknown; -} - -/** - * Common metadata fields shared across document chunks - * This combines page-level and chunk-level metadata for backward compatibility - */ -export interface DocumentMetadata extends ChunkMetadata { - url: string; - title: 
string; - library: string; - version: string; - level?: number; // Optional during scraping - path?: string[]; // Optional during scraping -} - /** * Generic progress callback type */ diff --git a/test/vector-search-e2e.test.ts b/test/vector-search-e2e.test.ts index d0e508cb..66755638 100644 --- a/test/vector-search-e2e.test.ts +++ b/test/vector-search-e2e.test.ts @@ -212,7 +212,7 @@ describe("Vector Search End-to-End Tests", () => { library: "non-existent-library", version: "1.0.0", query: "test query", - })).rejects.toThrow("Library 'non-existent-library' not found"); + })).rejects.toThrow("Library non-existent-library not found in store. Did you mean: test-library?"); }, 10000); it("should handle non-existent version searches gracefully", async () => { From 0786dbc54530950257607bb110c2f4c70a5be27d Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Sat, 1 Nov 2025 14:13:28 -0700 Subject: [PATCH 05/20] test: add nock for HTTP mocking, and implement refresh pipeline tests --- package-lock.json | 101 ++++- package.json | 1 + src/scraper/fetcher/HttpFetcher.ts | 4 + src/scraper/strategies/WebScraperStrategy.ts | 70 +-- src/store/DocumentManagementService.ts | 7 +- src/store/DocumentStore.ts | 1 + test/refresh-pipeline-e2e.test.ts | 443 +++++++++++++++++++ 7 files changed, 576 insertions(+), 51 deletions(-) create mode 100644 test/refresh-pipeline-e2e.test.ts diff --git a/package-lock.json b/package-lock.json index 29b6a142..4e6c5f09 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@arabold/docs-mcp-server", - "version": "1.26.0", + "version": "1.26.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@arabold/docs-mcp-server", - "version": "1.26.0", + "version": "1.26.1", "hasInstallScript": true, "license": "MIT", "dependencies": { @@ -84,6 +84,7 @@ "husky": "^9.1.7", "lint-staged": "^16.1.2", "memfs": "^4.34.0", + "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.7", @@ -2989,6 +2990,24 @@ "zod": "^3.24.1" } }, + "node_modules/@mswjs/interceptors": { + "version": "0.39.8", + "resolved": "https://registry.npmjs.org/@mswjs/interceptors/-/interceptors-0.39.8.tgz", + "integrity": "sha512-2+BzZbjRO7Ct61k8fMNHEtoKjeWI9pIlHFTqBwZ5icHpqszIgEZbjb1MW5Z0+bITTCTl3gk4PDBxs9tA/csXvA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@open-draft/deferred-promise": "^2.2.0", + "@open-draft/logger": "^0.3.0", + "@open-draft/until": "^2.0.0", + "is-node-process": "^1.2.0", + "outvariant": "^1.4.3", + "strict-event-emitter": "^0.5.1" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/@nodelib/fs.scandir": { "version": "2.1.5", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", @@ -3183,6 +3202,31 @@ "@octokit/openapi-types": "^25.1.0" } }, + "node_modules/@open-draft/deferred-promise": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@open-draft/deferred-promise/-/deferred-promise-2.2.0.tgz", + "integrity": "sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@open-draft/logger": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/@open-draft/logger/-/logger-0.3.0.tgz", + "integrity": "sha512-X2g45fzhxH238HKO4xbSr7+wBS8Fvw6ixhTDuvLd5mqh6bJJCFAPwU9mPDxbcrRtfxv4u5IHCEH77BmxvXmmxQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-node-process": "^1.2.0", + "outvariant": "^1.4.0" + } + }, + 
"node_modules/@open-draft/until": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/@open-draft/until/-/until-2.1.0.tgz", + "integrity": "sha512-U69T3ItWHvLwGg5eJ0n3I62nWuE6ilHlmz7zM0npLBRvPRd7e6NYmg54vvRtP5mZG7kZqZCFVdsTWo7BPtBujg==", + "dev": true, + "license": "MIT" + }, "node_modules/@pnpm/config.env-replace": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@pnpm/config.env-replace/-/config.env-replace-1.1.0.tgz", @@ -10225,6 +10269,13 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-node-process": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/is-node-process/-/is-node-process-1.2.0.tgz", + "integrity": "sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw==", + "dev": true, + "license": "MIT" + }, "node_modules/is-number": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", @@ -10677,6 +10728,13 @@ "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", "license": "MIT" }, + "node_modules/json-stringify-safe": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", + "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==", + "dev": true, + "license": "ISC" + }, "node_modules/jsonfile": { "version": "6.2.0", "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.2.0.tgz", @@ -12892,6 +12950,21 @@ "dev": true, "license": "MIT" }, + "node_modules/nock": { + "version": "14.0.10", + "resolved": "https://registry.npmjs.org/nock/-/nock-14.0.10.tgz", + "integrity": "sha512-Q7HjkpyPeLa0ZVZC5qpxBt5EyLczFJ91MEewQiIi9taWuA0KB/MDJlUWtON+7dGouVdADTQsf9RA7TZk6D8VMw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@mswjs/interceptors": "^0.39.5", + "json-stringify-safe": "^5.0.1", + "propagate": "^2.0.0" + }, + "engines": { + "node": ">=18.20.0 <20 || >=20.12.1" + } + }, "node_modules/node-abi": { "version": "3.75.0", "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.75.0.tgz", @@ -16102,6 +16175,13 @@ "integrity": "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==", "license": "MIT" }, + "node_modules/outvariant": { + "version": "1.4.3", + "resolved": "https://registry.npmjs.org/outvariant/-/outvariant-1.4.3.tgz", + "integrity": "sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA==", + "dev": true, + "license": "MIT" + }, "node_modules/ow": { "version": "0.28.2", "resolved": "https://registry.npmjs.org/ow/-/ow-0.28.2.tgz", @@ -16844,6 +16924,16 @@ ], "license": "MIT" }, + "node_modules/propagate": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/propagate/-/propagate-2.0.1.tgz", + "integrity": "sha512-vGrhOavPSTz4QVNuBNdcNXePNdNMaO1xj9yBeH1ScQPjk/rhg9sSlCXPhMkFuaNNW/syTvYqsnbIJxMBfRbbag==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, "node_modules/property-information": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz", @@ -18586,6 +18676,13 @@ "readable-stream": "^2.0.2" } }, + "node_modules/strict-event-emitter": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/strict-event-emitter/-/strict-event-emitter-0.5.1.tgz", + "integrity": 
"sha512-vMgjE/GGEPEFnhFub6pa4FmJBRBVOLpIII2hvCZ8Kzb7K0hlHo7mQv6xYrBvCL2LtAIBwFUK8wvuJgTVSQ5MFQ==", + "dev": true, + "license": "MIT" + }, "node_modules/string_decoder": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", diff --git a/package.json b/package.json index 7fcfcc1f..0beb7706 100644 --- a/package.json +++ b/package.json @@ -112,6 +112,7 @@ "husky": "^9.1.7", "lint-staged": "^16.1.2", "memfs": "^4.34.0", + "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", "semantic-release": "^24.2.7", diff --git a/src/scraper/fetcher/HttpFetcher.ts b/src/scraper/fetcher/HttpFetcher.ts index b9bfe849..46d77f37 100644 --- a/src/scraper/fetcher/HttpFetcher.ts +++ b/src/scraper/fetcher/HttpFetcher.ts @@ -144,6 +144,10 @@ export class HttpFetcher implements ContentFetcher { // Axios follows redirects by default, we need to explicitly disable it if needed maxRedirects: followRedirects ? 5 : 0, decompress: true, + // Allow 304 responses to be handled as successful responses + validateStatus: (status) => { + return (status >= 200 && status < 300) || status === 304; + }, }; const response = await axios.get(source, config); diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index ce1af2ae..4fa4e9ed 100644 --- a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -51,9 +51,6 @@ export class WebScraperStrategy extends BaseScraperStrategy { const { url } = item; try { - // Check if this is a refresh operation (has pageId and etag) - const isRefresh = item.pageId !== undefined && item.etag !== undefined; - // Define fetch options, passing signal, followRedirects, headers, and etag const fetchOptions = { signal, @@ -65,23 +62,9 @@ export class WebScraperStrategy extends BaseScraperStrategy { // Use AutoDetectFetcher which handles fallbacks automatically const rawContent: RawContent = await this.fetcher.fetch(url, fetchOptions); - // Handle NOT_MODIFIED status (HTTP 304) - if (rawContent.status === FetchStatus.NOT_MODIFIED) { - if (isRefresh) { - logger.debug(`✓ Page unchanged (304): ${url}`); - // Return empty result, no processing needed - return { url, links: [], status: FetchStatus.NOT_MODIFIED }; - } - // For non-refresh operations, 304 shouldn't happen - logger.warn(`⚠️ Unexpected 304 response for non-refresh operation: ${url}`); - return { url, links: [], status: FetchStatus.NOT_MODIFIED }; - } - - // Handle SUCCESS status (HTTP 200) - // For refresh operations with existing pages, mark for deletion before re-adding - const shouldRefresh = isRefresh && item.pageId; - if (shouldRefresh) { - logger.debug(`✓ Refreshing page content: ${url}`); + // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND + if (rawContent.status !== FetchStatus.SUCCESS) { + return { url, links: [], status: rawContent.status }; } // --- Start Pipeline Processing --- @@ -123,32 +106,27 @@ export class WebScraperStrategy extends BaseScraperStrategy { }; } - // For refresh operations, don't extract or follow links - let filteredLinks: string[] = []; - - if (!isRefresh) { - // Determine base for scope filtering: - // For depth 0 (initial page) use the final fetched URL (rawContent.source) so protocol/host redirects don't drop links. - // For deeper pages, use canonicalBaseUrl (set after first page) or fallback to original. - const baseUrl = - item.depth === 0 - ? new URL(rawContent.source) - : (this.canonicalBaseUrl ?? 
new URL(options.url)); - - filteredLinks = - processed.links?.filter((link) => { - try { - const targetUrl = new URL(link); - const scope = options.scope || "subpages"; - return ( - isInScope(baseUrl, targetUrl, scope) && - (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl)) - ); - } catch { - return false; - } - }) ?? []; - } + // Determine base for scope filtering: + // For depth 0 (initial page) use the final fetched URL (rawContent.source) so protocol/host redirects don't drop links. + // For deeper pages, use canonicalBaseUrl (set after first page) or fallback to original. + const baseUrl = + item.depth === 0 + ? new URL(rawContent.source) + : (this.canonicalBaseUrl ?? new URL(options.url)); + + const filteredLinks = + processed.links?.filter((link) => { + try { + const targetUrl = new URL(link); + const scope = options.scope || "subpages"; + return ( + isInScope(baseUrl, targetUrl, scope) && + (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl)) + ); + } catch { + return false; + } + }) ?? []; return { url, diff --git a/src/store/DocumentManagementService.ts b/src/store/DocumentManagementService.ts index b2895da3..583dd917 100644 --- a/src/store/DocumentManagementService.ts +++ b/src/store/DocumentManagementService.ts @@ -51,10 +51,11 @@ export class DocumentManagementService { embeddingConfig?: EmbeddingModelConfig | null, pipelineConfig?: PipelineConfiguration, ) { - const dbDir = storePath; - const dbPath = path.join(dbDir, "documents.db"); + // Handle special :memory: case for in-memory databases (primarily for testing) + const dbPath = + storePath === ":memory:" ? ":memory:" : path.join(storePath, "documents.db"); - logger.debug(`Using database directory: ${dbDir}`); + logger.debug(`Using database path: ${dbPath}`); // Directory creation is handled by the centralized path resolution diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index d23b1767..831cdec6 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -1049,6 +1049,7 @@ export class DocumentStore { async deleteDocumentsByPageId(pageId: number): Promise { try { const result = this.statements.deleteDocumentsByPageId.run(pageId); + logger.debug(`Deleted ${result.changes} document(s) for page ID ${pageId}`); return result.changes; } catch (error) { throw new ConnectionError("Failed to delete documents by page ID", error); diff --git a/test/refresh-pipeline-e2e.test.ts b/test/refresh-pipeline-e2e.test.ts new file mode 100644 index 00000000..d2f069e9 --- /dev/null +++ b/test/refresh-pipeline-e2e.test.ts @@ -0,0 +1,443 @@ +/** + * End-to-end tests for the refresh pipeline functionality. + * + * These tests validate that the refresh feature correctly handles: + * - Page deletions (404 responses) + * - Page updates (200 responses with new content) + * - Unchanged pages (304 responses) + * - Graceful error handling for broken links during normal scraping + * + * Uses nock to mock HTTP responses and an in-memory database for testing. 
+ */ + +import nock from "nock"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { PipelineManager } from "../src/pipeline/PipelineManager"; +import { ScraperService } from "../src/scraper/ScraperService"; +import type { ScraperOptions } from "../src/scraper/types"; +import { DocumentManagementService } from "../src/store/DocumentManagementService"; +import { DocumentStore } from "../src/store/DocumentStore"; +import type { StoreSearchResult } from "../src/store/types"; +import { ScraperRegistry } from "../src/scraper"; + +describe("Refresh Pipeline E2E Tests", () => { + let docService: DocumentManagementService; + let scraperService: ScraperService; + let pipelineManager: PipelineManager; + + const TEST_BASE_URL = "http://test-docs.example.com"; + const TEST_LIBRARY = "test-lib"; + const TEST_VERSION = "1.0.0"; + + beforeEach(async () => { + // Initialize in-memory store and services + // DocumentManagementService creates its own DocumentStore internally + docService = new DocumentManagementService(":memory:", null); + await docService.initialize(); + const registry = new ScraperRegistry(); + scraperService = new ScraperService(registry); + pipelineManager = new PipelineManager(docService, 3, { recoverJobs: false }); + await pipelineManager.start(); + + // Clear any previous nock mocks + nock.cleanAll(); + }); + + afterEach(async () => { + // Cleanup + await pipelineManager.stop(); + await docService.shutdown(); + nock.cleanAll(); + }); + + describe("Refresh Scenarios", () => { + it("should delete documents when a page returns 404 during refresh", async () => { + // Setup: Mock initial two-page site + nock(TEST_BASE_URL) + .get("/") + .reply(200, "
<html><body><h1>Home</h1><a href='/page1'>Page 1</a><a href='/page2'>Page 2</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }) + .get("/page1") + .reply(200, "<html><body><h1>Page 1</h1><p>Content of page 1</p></body></html>", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }) + .get("/page2") + .reply(200, "<html><body><h1>Page 2</h1><p>Content of page 2</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page2-v1"', + }); + + // Initial scrape + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + // Wait for job to complete + await pipelineManager.waitForJobCompletion(initialJobId); + + // Verify all pages were indexed + const initialSearch = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "page", 10); + expect(initialSearch.length).toBeGreaterThan(0); + + // Get page IDs for verification + const pages = await docService.getPagesByVersionId( + await docService.ensureVersion({ library: TEST_LIBRARY, version: TEST_VERSION }), + ); + expect(pages.length).toBe(3); // home, page1, page2 + + const page2 = pages.find((p) => p.url === `${TEST_BASE_URL}/page2`); + expect(page2).toBeDefined(); + const page2Id = page2!.id; + + // Setup: Mock refresh with page2 deleted (404) + // Enable nock logging to see what requests are made + nock(TEST_BASE_URL) + .get("/") + .matchHeader("if-none-match", '"home-v1"') + .reply(304, undefined, { ETag: '"home-v1"' }) // Unchanged + .get("/page1") + .matchHeader("if-none-match", '"page1-v1"') + .reply(304, undefined, { ETag: '"page1-v1"' }) // Unchanged + .get("/page2") + .matchHeader("if-none-match", '"page2-v1"') + .reply(404); // Deleted! + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify page2 documents were deleted by checking if we can still find page2 content + // Use a unique phrase that only appears in page2 to avoid false positives from keyword matching + const page2Search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "Content of page", 10); + const hasPage2Content = page2Search.some((r: StoreSearchResult) => + r.url === `${TEST_BASE_URL}/page2` + ); + expect(hasPage2Content).toBe(false); + + // Verify page1 documents still exist + const page1Search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "Content of page 1", 10); + expect(page1Search.length).toBeGreaterThan(0); + }, 30000); + + it("should update documents when a page has changed content during refresh", async () => { + const originalContent = "Original content version 1"; + const updatedContent = "Updated content version 2"; + + // Setup: Mock initial site + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + `
<html><body><h1>Home</h1><a href="/page1">Page 1</a></body></html>`, + { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/page1") + .reply( + 200, + `<html><body><h1>Page 1</h1><p>${originalContent}</p></body></html>
`, + { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }, + ); + + // Initial scrape + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Verify original content is indexed + const initialSearch = await docService.searchStore( + TEST_LIBRARY, + TEST_VERSION, + "original content", + 10, + ); + expect(initialSearch.length).toBeGreaterThan(0); + expect(initialSearch[0].content).toContain(originalContent); + + // Setup: Mock refresh with updated page1 content + nock(TEST_BASE_URL) + .get("/") + .reply(304, undefined, { ETag: '"home-v1"' }) // Unchanged + .get("/page1") + .reply( + 200, + `
<html><body><h1>Page 1</h1><p>${updatedContent}</p></body></html>
`, + { + "Content-Type": "text/html", + ETag: '"page1-v2"', // New ETag indicates change + }, + ); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify updated content is now indexed + const updatedSearch = await docService.searchStore( + TEST_LIBRARY, + TEST_VERSION, + "updated content", + 10, + ); + expect(updatedSearch.length).toBeGreaterThan(0); + expect(updatedSearch[0].content).toContain(updatedContent); + + // Verify old content is no longer indexed + const oldSearch = await docService.searchStore( + TEST_LIBRARY, + TEST_VERSION, + "original content", + 10, + ); + const hasOldContent = oldSearch.some((r: StoreSearchResult) => r.content.includes(originalContent)); + expect(hasOldContent).toBe(false); + }, 30000); + + it("should skip processing when pages return 304 Not Modified", async () => { + // Setup: Mock initial site + nock(TEST_BASE_URL) + .get("/") + .reply(200, "
<html><body><h1>Home</h1><a href='/page1'>Page 1</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }) + .get("/page1") + .reply(200, "<html><body><h1>Page 1</h1><p>Stable content</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }); + + // Initial scrape + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Get initial document count + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const initialPages = await docService.getPagesByVersionId(versionId); + const initialPageCount = initialPages.length; + + // Setup: Mock refresh with all 304 responses + nock(TEST_BASE_URL) + .get("/") + .reply(304, undefined, { ETag: '"home-v1"' }) + .get("/page1") + .reply(304, undefined, { ETag: '"page1-v1"' }); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify page count hasn't changed + const finalPages = await docService.getPagesByVersionId(versionId); + expect(finalPages.length).toBe(initialPageCount); + + // Verify content is still accessible + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "stable", 10); + expect(search.length).toBeGreaterThan(0); + }, 30000); + + it("should discover and index new pages during refresh", async () => { + // Setup: Mock initial site with 2 pages + nock(TEST_BASE_URL) + .get("/") + .reply(200, "

<html><head><title>Home</title></head><body><a href='/page1'>Page 1</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }) + .get("/page1") + .reply(200, "

<html><head><title>Page 1</title></head><body>Original page</body></html>
", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }); + + // Initial scrape + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Verify initial page count + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const initialPages = await docService.getPagesByVersionId(versionId); + expect(initialPages.length).toBe(2); // home, page1 + + // Setup: Mock refresh where home page now links to a new page2 + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

<html><head><title>Home</title></head><body><a href='/page1'>Page 1</a><a href='/page2'>Page 2</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v2"', // Changed ETag + }, + ) + .get("/page1") + .reply(304, undefined, { ETag: '"page1-v1"' }) // Unchanged + .get("/page2") + .reply(200, "

<html><head><title>Page 2</title></head><body>Newly added page</body></html>
", { + "Content-Type": "text/html", + ETag: '"page2-v1"', + }); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify new page was discovered and indexed + const finalPages = await docService.getPagesByVersionId(versionId); + expect(finalPages.length).toBe(3); // home, page1, page2 + + const page2 = finalPages.find((p) => p.url === `${TEST_BASE_URL}/page2`); + expect(page2).toBeDefined(); + + // Verify new page content is searchable + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "newly added", 10); + expect(search.length).toBeGreaterThan(0); + }, 30000); + }); + + describe("Standard Scrape Error Handling", () => { + it("should gracefully handle 404 errors for broken links during normal scraping", async () => { + // Setup: Mock site with a broken link + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

<html><head><title>Home</title></head><body><a href='/valid-page'>Valid</a><a href='/broken-link'>Broken</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/valid-page") + .reply(200, "

<html><head><title>Valid Page</title></head><body>This page exists</body></html>
", { + "Content-Type": "text/html", + ETag: '"valid-v1"', + }) + .get("/broken-link") + .reply(404); // Broken link! + + // Execute scrape + const jobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + // Should complete successfully despite the 404 + await pipelineManager.waitForJobCompletion(jobId); + + const job = await pipelineManager.getJob(jobId); + expect(job?.status).toBe("completed"); + + // Verify valid pages were indexed + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + + // Should have home and valid-page, but NOT broken-link + expect(pages.length).toBe(2); + const urls = pages.map((p) => p.url); + expect(urls).toContain(`${TEST_BASE_URL}/`); + expect(urls).toContain(`${TEST_BASE_URL}/valid-page`); + expect(urls).not.toContain(`${TEST_BASE_URL}/broken-link`); + + // Verify valid page content is searchable + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "exists", 10); + expect(search.length).toBeGreaterThan(0); + }, 30000); + + it("should continue scraping after encountering multiple 404 errors", async () => { + // Setup: Mock site with multiple broken links interspersed with valid ones + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

<html><head><title>Home</title></head><body><a href='/page1'>P1</a><a href='/404-1'>404</a><a href='/page2'>P2</a><a href='/404-2'>404</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/page1") + .reply(200, "

<html><head><title>Page 1</title></head><body></body></html>
", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }) + .get("/404-1") + .reply(404) + .get("/page2") + .reply(200, "

<html><head><title>Page 2</title></head><body></body></html>
", { + "Content-Type": "text/html", + ETag: '"page2-v1"', + }) + .get("/404-2") + .reply(404); + + // Execute scrape + const jobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(jobId); + + // Verify all valid pages were indexed despite multiple 404s + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + + expect(pages.length).toBe(3); // home, page1, page2 + const urls = pages.map((p) => p.url); + expect(urls).toContain(`${TEST_BASE_URL}/`); + expect(urls).toContain(`${TEST_BASE_URL}/page1`); + expect(urls).toContain(`${TEST_BASE_URL}/page2`); + }, 30000); + }); +}); From 1b99ec1a017802885fae3dd77b04844b9ab1b81a Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Mon, 10 Nov 2025 08:41:53 +0100 Subject: [PATCH 06/20] refactor(docs): reorganize and expand agent instruction guidelines - Restructure AGENTS.md into clearer sections with improved hierarchy - Add detailed documentation writing principles and structure guidelines - Expand TypeScript development standards with testing and error handling - Include comprehensive commit message format and conventions - Add security, MCP protocol, and pull request guidelines - Improve readability with better formatting and organization The refactoring transforms the instruction file from a flat list into a well-organized reference document with specific sections for documentation, development practices, and contribution standards. --- AGENTS.md | 211 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 134 insertions(+), 77 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 170febea..5d39610f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,84 +1,141 @@ -# Custom Instructions +# Agent Instructions for docs-mcp-server -- The repository for this project is located on GitHub at `arabold/docs-mcp-server`. -- You must read the `README.md` to understand the project structure and setup. -- You must read the `ARCHITECTURE.md` file before making changes across multiple services. -- You must follow DRY, KISS, YAGNI, and SOLID principles. -- You must use the latest version of the programming language and libraries. -- Prefer the simplest solution. -- Never commit secrets, credentials, or sensitive data to the repository. +## Repository Context + +- Repository: `arabold/docs-mcp-server` +- Read `README.md` for project structure and setup +- Read `ARCHITECTURE.md` before making changes across multiple services +- Follow DRY, KISS, YAGNI, and SOLID principles +- Use latest stable versions of programming language and libraries +- Prefer the simplest solution that meets requirements +- Never commit secrets, credentials, or sensitive data ## Documentation -- The `README.md` targets end users that utilize the library for the first time. It should primarily cover prerequisites, installation, configuration, first start, trouble shooting. -- The `ARCHITECTURE.md` targets developers making active changes to the code. It should give a high level overview of the architecture of the library, a feature list, and then reference individual feature documentations in the docs/ folder. 
-- Write in present tense, describing how the system currently works -- Focus on what the system does, not what it doesn't do or used to do -- Avoid discussing past problems, bugs, or alternative approaches unless directly relevant to understanding the current design -- Use declarative statements rather than explanatory narratives -- Don't include "Important" callouts or emphasis unless documenting critical constraints -- Avoid problem/solution framing - just describe the current behavior and its rationale -- Keep examples focused on illustrating current functionality, not contrasting with previous versions -- Do not create new documentation files unless explicitly asked to. Instead update existing files or create new sections as needed. +### File Targets + +- `README.md` targets end users: prerequisites, installation, configuration, first start, troubleshooting +- `ARCHITECTURE.md` targets active developers: high-level architecture, feature list, references to `docs/` folder +- `docs/` folder provides deep dives into specific features, subsystems, or technical concepts + +### Writing Principles + +- Use present tense to describe current system behavior +- Use declarative statements, not explanatory narratives +- Describe what the system does, not what it doesn't do or used to do +- Avoid problem/solution framing - describe current behavior and rationale +- Omit "Important" callouts unless documenting critical constraints or safety issues +- Keep examples focused on current functionality, not historical comparisons +- Update existing documentation or add sections; only create new files when explicitly requested + +### Structure Guidelines + +- Start with high-level overview before details +- Use clear, descriptive section headers +- Progress from concepts to specifics (allows readers to stop when satisfied) +- Use tables for comparing options, statuses, or behaviors +- Include Mermaid diagrams for workflows, state machines, or component relationships +- Focus on high-level concepts and component relationships (use class/interface names when helpful, as they change less frequently than implementation details) +- Explain architectural decisions with trade-offs +- Avoid explaining source code implementation - use TSDoc comments in source files instead ### Source Code Documentation -- Ensure each source file begins with a comment block summarizing its purpose and logic. -- If no block exists, create one before editing. -- After completing changes, update this block to reflect the changes. -- Always make the comment block clear and concise. - -## Architecture - -- Focus on system concepts and component relationships. -- Put implementation details in source code. -- Update `ARCHITECTURE.md` when the architecture changes. -- Do not use special characters like braces in mermaid diagram titles or names. Quote them if necessary. -- Do not use markdown in mermaid diagrams. - -## TypeScript - -- Install dependencies using `npm install` inside `apps/` instead of adding them to the `package.json` file manually. -- We're using Node.js 22.x, `vite-node` for running TypeScript files, and `vitest` for testing. -- Prefer a specific type or `unknown` over `any`. -- Do not use non-null assertions (`!`). Use optional chaining (`?.`) or nullish coalescing (`??`). -- Follow `biome` for formatting and import order. -- Always place `import` statements at the top of the file. - -## Web UI - -- Use AlpineJS for frontend components and TailwindCSS for styling. -- Use TSX with kitajs for AlpineJS components. 
-- Use HTMX for server-side interactions. -- Avoid `{foo && }` in TSX; use ternary expressions instead. - -## Logging - -- Use `console.*` for CLI user output (results, direct feedback). -- Use `logger.info/warn/error` for meaningful application events; prefix with a relevant emoji. -- Use `logger.debug` for detailed developer/tracing logs; no emoji prefix. -- Prefer `logger.debug` over `logger.info` for granular internal steps to reduce log verbosity. - -## Testing - -- Consider maintainability and efforts when writing tests. -- Always create unit test files alongside the source file with `.test.ts` suffix. -- Focus on high value, low effort tests first. Defer complex mocking, complex state management testing and concurrent processing unless explicitly requested by the user. -- Always test the intended bevavior, not the implementation details. -- Avoid timing sensitive tests unless absolutely necessary. -- Use `npx vite-node` to run individual TypeScript files. - -## Git - -- Branches must be created locally before pushing. -- Branch names must be prefixed with type (`feature/`, `bugfix/`, `chore/`) and include the issue number if available (e.g., `feature/1234-description`). -- All commit messages must use Conventional Commits (`feat:`, `fix:`, etc.). -- Commit subject must be imperative mood and ≤72 characters. -- If a commit body is present, add a blank line before it. -- Commit body (for non-trivial changes) must explain what and why, not how. -- Reference related issues in commit messages when relevant (e.g., `Closes #123`). -- Do not include unrelated changes in a single commit. -- Do not use vague or generic commit messages. -- Pull request descriptions must summarize the what and why of all changes in the branch (not just a list of commits or the how). -- Pull requests must target `main` unless specified otherwise. -- When creating new GitHub issues, use built-in labels to categorize them (e.g., `bug`, `enhancement`, `documentation`) but avoid creating new labels unless explicitly asked to. +- Document source code with TSDoc comments (not in separate documentation files) +- Each source file must begin with a comment block summarizing purpose and logic +- Create the comment block before editing if it doesn't exist +- Update the comment block after completing changes +- Keep comment blocks clear and concise + +## Architecture Documentation + +- Focus on system concepts and component relationships +- Place implementation details in source code, not architecture docs +- Update `ARCHITECTURE.md` when architecture changes +- In Mermaid diagrams: + - Avoid special characters (e.g., braces) in titles or names; quote if necessary + - Do not use markdown formatting + +## TypeScript Conventions + +### Dependencies and Tooling + +- Install dependencies via `npm install` (not by manually editing `package.json`) +- Runtime: Node.js 22.x +- Execution: `vite-node` for running TypeScript files +- Testing: `vitest` + +### Type Safety + +- Prefer specific types or `unknown` over `any` +- Avoid non-null assertions (`!`) +- Use optional chaining (`?.`) and nullish coalescing (`??`) + +### Code Style + +- Follow `biome` for formatting and import order +- Place all `import` statements at the top of files + +## Web UI Stack + +- Frontend components: AlpineJS +- Styling: TailwindCSS +- AlpineJS components: TSX with kitajs +- Server-side interactions: HTMX +- TSX pattern: Use ternary expressions (`{foo ? 
: null}`), not short-circuit evaluation (`{foo && }`) + +## Logging Strategy + +### Output Channels + +- `console.*`: CLI user output (results, direct feedback to user) +- `logger.info/warn/error`: Meaningful application events (prefix with relevant emoji) +- `logger.debug`: Detailed developer/tracing logs (no emoji prefix) + +### Verbosity Control + +- Prefer `logger.debug` over `logger.info` for granular internal steps +- Reduces default log verbosity while maintaining debugging capability + +## Testing Approach + +### Test Files + +- Create unit test files alongside source files with `.test.ts` suffix +- Run individual TypeScript files: `npx vite-node ` + +### Test Strategy + +- Prioritize high-value, low-effort tests +- Test intended behavior, not implementation details +- Defer complex mocking, state management testing, and concurrent processing unless explicitly requested +- Avoid timing-sensitive tests unless absolutely necessary +- Balance maintainability with test coverage + +## Git Workflow + +### Branching + +- Create branches locally before pushing +- Branch naming: `/-` (e.g., `feature/1234-add-refresh-logic`) +- Types: `feature/`, `bugfix/`, `chore/` + +### Commits + +- Format: Conventional Commits (`feat:`, `fix:`, `docs:`, `refactor:`, `test:`, `chore:`) +- Subject: Imperative mood, ≤72 characters +- Body: Separate from subject with blank line +- Body content: Explain what and why, not how (for non-trivial changes) +- Reference issues when relevant (e.g., `Closes #123`) +- One logical change per commit (no unrelated changes) +- Avoid vague messages (e.g., "fix bug", "update code") + +### Pull Requests + +- Description: Summarize what and why of all changes (not just commit list or how) +- Target: `main` branch unless specified otherwise + +### Issues + +- Use built-in labels to categorize (e.g., `bug`, `enhancement`, `documentation`) +- Avoid creating new labels unless explicitly requested From 50e69c2eaddaa381645d273291f0dd4ede775466 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Mon, 10 Nov 2025 08:44:10 +0100 Subject: [PATCH 07/20] chore: update dependencies and devDependencies to latest versions --- biome.json | 2 +- package-lock.json | 1094 ++++++++++++++++----------------------------- package.json | 50 +-- 3 files changed, 410 insertions(+), 736 deletions(-) diff --git a/biome.json b/biome.json index 4547507c..d6f11719 100644 --- a/biome.json +++ b/biome.json @@ -1,5 +1,5 @@ { - "$schema": "https://biomejs.dev/schemas/2.2.0/schema.json", + "$schema": "https://biomejs.dev/schemas/2.3.2/schema.json", "assist": { "actions": { "source": { diff --git a/package-lock.json b/package-lock.json index 4e6c5f09..8184dbe5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,7 +11,7 @@ "license": "MIT", "dependencies": { "@fastify/formbody": "^8.0.2", - "@fastify/static": "^8.2.0", + "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", @@ -19,57 +19,57 @@ "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", - "@modelcontextprotocol/sdk": "^1.17.1", - "@trpc/client": "^11.4.4", + "@modelcontextprotocol/sdk": "^1.20.2", + "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", - "axios": "^1.11.0", + "axios": "^1.13.1", "axios-retry": "^4.5.0", - "better-sqlite3": "^12.2.0", + "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", - "dompurify": "^3.2.6", - "dotenv": "^17.2.1", + "dompurify": 
"^3.3.0", + "dotenv": "^17.2.3", "env-paths": "^3.0.0", - "fastify": "^5.4.0", + "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", - "header-generator": "^2.1.69", + "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", - "jose": "^6.0.12", + "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", - "mime": "^4.0.7", + "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", - "posthog-node": "^5.7.0", + "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", - "semver": "^7.7.2", + "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", - "turndown": "^7.2.0", - "zod": "^4.0.14" + "turndown": "^7.2.2", + "zod": "^4.1.12" }, "bin": { "docs-mcp-server": "dist/index.js" }, "devDependencies": { - "@biomejs/biome": "^2.1.3", + "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", - "@semantic-release/github": "^11.0.3", + "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", - "@tailwindcss/postcss": "^4.1.11", - "@tailwindcss/vite": "^4.1.11", + "@tailwindcss/postcss": "^4.1.16", + "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", @@ -77,19 +77,19 @@ "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", - "@types/semver": "^7.7.0", - "@types/turndown": "^5.0.5", + "@types/semver": "^7.7.1", + "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", - "lint-staged": "^16.1.2", - "memfs": "^4.34.0", + "lint-staged": "^16.2.6", + "memfs": "^4.50.0", "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", - "semantic-release": "^24.2.7", + "semantic-release": "^24.2.9", "tailwindcss": "^4.1.4", - "typescript": "^5.9.2", + "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", @@ -1009,9 +1009,9 @@ } }, "node_modules/@biomejs/biome": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/biome/-/biome-2.2.0.tgz", - "integrity": "sha512-3On3RSYLsX+n9KnoSgfoYlckYBoU6VRM22cw1gB4Y0OuUVSYd/O/2saOJMrA4HFfA1Ff0eacOvMN1yAAvHtzIw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/biome/-/biome-2.3.2.tgz", + "integrity": "sha512-8e9tzamuDycx7fdrcJ/F/GDZ8SYukc5ud6tDicjjFqURKYFSWMl0H0iXNXZEGmcmNUmABgGuHThPykcM41INgg==", "dev": true, "license": "MIT OR Apache-2.0", "bin": { @@ -1025,20 +1025,20 @@ "url": "https://opencollective.com/biome" }, "optionalDependencies": { - "@biomejs/cli-darwin-arm64": "2.2.0", - "@biomejs/cli-darwin-x64": "2.2.0", - "@biomejs/cli-linux-arm64": "2.2.0", - "@biomejs/cli-linux-arm64-musl": "2.2.0", - "@biomejs/cli-linux-x64": "2.2.0", - "@biomejs/cli-linux-x64-musl": "2.2.0", - "@biomejs/cli-win32-arm64": "2.2.0", - "@biomejs/cli-win32-x64": "2.2.0" + "@biomejs/cli-darwin-arm64": "2.3.2", + "@biomejs/cli-darwin-x64": "2.3.2", + "@biomejs/cli-linux-arm64": "2.3.2", + "@biomejs/cli-linux-arm64-musl": "2.3.2", + "@biomejs/cli-linux-x64": "2.3.2", + "@biomejs/cli-linux-x64-musl": "2.3.2", + "@biomejs/cli-win32-arm64": "2.3.2", + "@biomejs/cli-win32-x64": "2.3.2" } }, "node_modules/@biomejs/cli-darwin-arm64": { - "version": "2.2.0", - "resolved": 
"https://registry.npmjs.org/@biomejs/cli-darwin-arm64/-/cli-darwin-arm64-2.2.0.tgz", - "integrity": "sha512-zKbwUUh+9uFmWfS8IFxmVD6XwqFcENjZvEyfOxHs1epjdH3wyyMQG80FGDsmauPwS2r5kXdEM0v/+dTIA9FXAg==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-arm64/-/cli-darwin-arm64-2.3.2.tgz", + "integrity": "sha512-4LECm4kc3If0JISai4c3KWQzukoUdpxy4fRzlrPcrdMSRFksR9ZoXK7JBcPuLBmd2SoT4/d7CQS33VnZpgBjew==", "cpu": [ "arm64" ], @@ -1053,9 +1053,9 @@ } }, "node_modules/@biomejs/cli-darwin-x64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-x64/-/cli-darwin-x64-2.2.0.tgz", - "integrity": "sha512-+OmT4dsX2eTfhD5crUOPw3RPhaR+SKVspvGVmSdZ9y9O/AgL8pla6T4hOn1q+VAFBHuHhsdxDRJgFCSC7RaMOw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-x64/-/cli-darwin-x64-2.3.2.tgz", + "integrity": "sha512-jNMnfwHT4N3wi+ypRfMTjLGnDmKYGzxVr1EYAPBcauRcDnICFXN81wD6wxJcSUrLynoyyYCdfW6vJHS/IAoTDA==", "cpu": [ "x64" ], @@ -1070,9 +1070,9 @@ } }, "node_modules/@biomejs/cli-linux-arm64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64/-/cli-linux-arm64-2.2.0.tgz", - "integrity": "sha512-6eoRdF2yW5FnW9Lpeivh7Mayhq0KDdaDMYOJnH9aT02KuSIX5V1HmWJCQQPwIQbhDh68Zrcpl8inRlTEan0SXw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64/-/cli-linux-arm64-2.3.2.tgz", + "integrity": "sha512-amnqvk+gWybbQleRRq8TMe0rIv7GHss8mFJEaGuEZYWg1Tw14YKOkeo8h6pf1c+d3qR+JU4iT9KXnBKGON4klw==", "cpu": [ "arm64" ], @@ -1087,9 +1087,9 @@ } }, "node_modules/@biomejs/cli-linux-arm64-musl": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64-musl/-/cli-linux-arm64-musl-2.2.0.tgz", - "integrity": "sha512-egKpOa+4FL9YO+SMUMLUvf543cprjevNc3CAgDNFLcjknuNMcZ0GLJYa3EGTCR2xIkIUJDVneBV3O9OcIlCEZQ==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64-musl/-/cli-linux-arm64-musl-2.3.2.tgz", + "integrity": "sha512-2Zz4usDG1GTTPQnliIeNx6eVGGP2ry5vE/v39nT73a3cKN6t5H5XxjcEoZZh62uVZvED7hXXikclvI64vZkYqw==", "cpu": [ "arm64" ], @@ -1104,9 +1104,9 @@ } }, "node_modules/@biomejs/cli-linux-x64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64/-/cli-linux-x64-2.2.0.tgz", - "integrity": "sha512-5UmQx/OZAfJfi25zAnAGHUMuOd+LOsliIt119x2soA2gLggQYrVPA+2kMUxR6Mw5M1deUF/AWWP2qpxgH7Nyfw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64/-/cli-linux-x64-2.3.2.tgz", + "integrity": "sha512-8BG/vRAhFz1pmuyd24FQPhNeueLqPtwvZk6yblABY2gzL2H8fLQAF/Z2OPIc+BPIVPld+8cSiKY/KFh6k81xfA==", "cpu": [ "x64" ], @@ -1121,9 +1121,9 @@ } }, "node_modules/@biomejs/cli-linux-x64-musl": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64-musl/-/cli-linux-x64-musl-2.2.0.tgz", - "integrity": "sha512-I5J85yWwUWpgJyC1CcytNSGusu2p9HjDnOPAFG4Y515hwRD0jpR9sT9/T1cKHtuCvEQ/sBvx+6zhz9l9wEJGAg==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64-musl/-/cli-linux-x64-musl-2.3.2.tgz", + "integrity": "sha512-gzB19MpRdTuOuLtPpFBGrV3Lq424gHyq2lFj8wfX9tvLMLdmA/R9C7k/mqBp/spcbWuHeIEKgEs3RviOPcWGBA==", "cpu": [ "x64" ], @@ -1138,9 +1138,9 @@ } }, "node_modules/@biomejs/cli-win32-arm64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-arm64/-/cli-win32-arm64-2.2.0.tgz", - "integrity": 
"sha512-n9a1/f2CwIDmNMNkFs+JI0ZjFnMO0jdOyGNtihgUNFnlmd84yIYY2KMTBmMV58ZlVHjgmY5Y6E1hVTnSRieggA==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-arm64/-/cli-win32-arm64-2.3.2.tgz", + "integrity": "sha512-lCruqQlfWjhMlOdyf5pDHOxoNm4WoyY2vZ4YN33/nuZBRstVDuqPPjS0yBkbUlLEte11FbpW+wWSlfnZfSIZvg==", "cpu": [ "arm64" ], @@ -1155,9 +1155,9 @@ } }, "node_modules/@biomejs/cli-win32-x64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-x64/-/cli-win32-x64-2.2.0.tgz", - "integrity": "sha512-Nawu5nHjP/zPKTIryh2AavzTc/KEg4um/MxWdXW0A6P/RZOyIpa7+QSjeXwAwX/utJGaCoXRPWtF3m5U/bB3Ww==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-x64/-/cli-win32-x64-2.3.2.tgz", + "integrity": "sha512-6Ee9P26DTb4D8sN9nXxgbi9Dw5vSOfH98M7UlmkjKB2vtUbrRqCbZiNfryGiwnPIpd6YUoTl7rLVD2/x1CyEHQ==", "cpu": [ "x64" ], @@ -2161,9 +2161,9 @@ } }, "node_modules/@fastify/static": { - "version": "8.2.0", - "resolved": "https://registry.npmjs.org/@fastify/static/-/static-8.2.0.tgz", - "integrity": "sha512-PejC/DtT7p1yo3p+W7LiUtLMsV8fEvxAK15sozHy9t8kwo5r0uLYmhV/inURmGz1SkHZFz/8CNtHLPyhKcx4SQ==", + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/@fastify/static/-/static-8.3.0.tgz", + "integrity": "sha512-yKxviR5PH1OKNnisIzZKmgZSus0r2OZb8qCSbqmw34aolT4g3UlzYfeBRym+HJ1J471CR8e2ldNub4PubD1coA==", "funding": [ { "type": "github", @@ -2231,19 +2231,6 @@ "node": ">=12" } }, - "node_modules/@isaacs/fs-minipass": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz", - "integrity": "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==", - "dev": true, - "license": "ISC", - "dependencies": { - "minipass": "^7.0.4" - }, - "engines": { - "node": ">=18.0.0" - } - }, "node_modules/@joplin/turndown-plugin-gfm": { "version": "1.0.62", "resolved": "https://registry.npmjs.org/@joplin/turndown-plugin-gfm/-/turndown-plugin-gfm-1.0.62.tgz", @@ -2290,9 +2277,9 @@ "license": "MIT" }, "node_modules/@jridgewell/trace-mapping": { - "version": "0.3.30", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.30.tgz", - "integrity": "sha512-GQ7Nw5G2lTu/BtHTKfXhKHok2WGetd4XYcVKGx00SjAk8GMwgJM3zr6zORiPGuOE+/vkc90KtTosSSvaCjKb2Q==", + "version": "0.3.31", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", + "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", "dev": true, "license": "MIT", "dependencies": { @@ -2928,9 +2915,9 @@ "license": "BSD-2-Clause" }, "node_modules/@modelcontextprotocol/sdk": { - "version": "1.17.3", - "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.17.3.tgz", - "integrity": "sha512-JPwUKWSsbzx+DLFznf/QZ32Qa+ptfbUlHhRLrBQBAFu9iI1iYvizM4p+zhhRDceSsPutXp4z+R/HPVphlIiclg==", + "version": "1.20.2", + "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.20.2.tgz", + "integrity": "sha512-6rqTdFt67AAAzln3NOKsXRmv5ZzPkgbfaebKBqUbts7vK1GZudqnrun5a8d3M/h955cam9RHZ6Jb4Y1XhnmFPg==", "license": "MIT", "dependencies": { "ajv": "^6.12.6", @@ -3008,44 +2995,6 @@ "node": ">=18" } }, - "node_modules/@nodelib/fs.scandir": { - "version": "2.1.5", - "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", - "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", - "dev": true, - 
"license": "MIT", - "dependencies": { - "@nodelib/fs.stat": "2.0.5", - "run-parallel": "^1.1.9" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/@nodelib/fs.stat": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", - "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 8" - } - }, - "node_modules/@nodelib/fs.walk": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", - "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@nodelib/fs.scandir": "2.1.5", - "fastq": "^1.6.0" - }, - "engines": { - "node": ">= 8" - } - }, "node_modules/@octokit/auth-token": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/@octokit/auth-token/-/auth-token-6.0.0.tgz", @@ -3283,9 +3232,9 @@ } }, "node_modules/@posthog/core": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/@posthog/core/-/core-1.0.0.tgz", - "integrity": "sha512-gquQld+duT9DdzLIFoHZkUMW0DZOTSLCtSjuuC/zKFz65Qecbz9p37DHBJMkw0dCuB8Mgh2GtH8Ag3PznJrP3g==", + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@posthog/core/-/core-1.5.0.tgz", + "integrity": "sha512-oxfV20QMNwH30jKybUyqi3yGuMghULQz1zkJgQG3rjpHDxhD2vDN6E7UpmaqgphMIvGG3Q+DgfU10zfSPA7w7w==", "license": "MIT" }, "node_modules/@rollup/plugin-node-resolve": { @@ -3899,9 +3848,9 @@ } }, "node_modules/@semantic-release/github": { - "version": "11.0.4", - "resolved": "https://registry.npmjs.org/@semantic-release/github/-/github-11.0.4.tgz", - "integrity": "sha512-fU/nLSjkp9DmB0h7FVO5imhhWJMvq2LjD4+3lz3ZAzpDLY9+KYwC+trJ+g7LbZeJv9y3L9fSFSg2DduUpiT6bw==", + "version": "11.0.6", + "resolved": "https://registry.npmjs.org/@semantic-release/github/-/github-11.0.6.tgz", + "integrity": "sha512-ctDzdSMrT3H+pwKBPdyCPty6Y47X8dSrjd3aPZ5KKIKKWTwZBE9De8GtsH3TyAlw3Uyo2stegMx6rJMXKpJwJA==", "dev": true, "license": "MIT", "dependencies": { @@ -3913,13 +3862,13 @@ "aggregate-error": "^5.0.0", "debug": "^4.3.4", "dir-glob": "^3.0.1", - "globby": "^14.0.0", "http-proxy-agent": "^7.0.0", "https-proxy-agent": "^7.0.0", "issue-parser": "^7.0.0", "lodash-es": "^4.17.21", "mime": "^4.0.0", "p-filter": "^4.0.0", + "tinyglobby": "^0.2.14", "url-join": "^5.0.0" }, "engines": { @@ -4310,19 +4259,6 @@ "url": "https://github.com/sindresorhus/is?sponsor=1" } }, - "node_modules/@sindresorhus/merge-streams": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/@sindresorhus/merge-streams/-/merge-streams-2.3.0.tgz", - "integrity": "sha512-LtoMMhxAlorcGhmFYI+LhPgbPZCkgP6ra1YL604EeF6U98pLlQ3iWIGMdWSC+vWmPBWBNgmDBAhnAobLROJmwg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/@smithy/abort-controller": { "version": "4.0.5", "resolved": "https://registry.npmjs.org/@smithy/abort-controller/-/abort-controller-4.0.5.tgz", @@ -4964,54 +4900,49 @@ } }, "node_modules/@tailwindcss/node": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/node/-/node-4.1.12.tgz", - "integrity": "sha512-3hm9brwvQkZFe++SBt+oLjo4OLDtkvlE8q2WalaD/7QWaeM7KEJbAiY/LJZUaCs7Xa8aUu4xy3uoyX4q54UVdQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/node/-/node-4.1.16.tgz", + "integrity": 
"sha512-BX5iaSsloNuvKNHRN3k2RcCuTEgASTo77mofW0vmeHkfrDWaoFAFvNHpEgtu0eqyypcyiBkDWzSMxJhp3AUVcw==", "dev": true, "license": "MIT", "dependencies": { "@jridgewell/remapping": "^2.3.4", "enhanced-resolve": "^5.18.3", - "jiti": "^2.5.1", - "lightningcss": "1.30.1", - "magic-string": "^0.30.17", + "jiti": "^2.6.1", + "lightningcss": "1.30.2", + "magic-string": "^0.30.19", "source-map-js": "^1.2.1", - "tailwindcss": "4.1.12" + "tailwindcss": "4.1.16" } }, "node_modules/@tailwindcss/oxide": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.12.tgz", - "integrity": "sha512-gM5EoKHW/ukmlEtphNwaGx45fGoEmP10v51t9unv55voWh6WrOL19hfuIdo2FjxIaZzw776/BUQg7Pck++cIVw==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.16.tgz", + "integrity": "sha512-2OSv52FRuhdlgyOQqgtQHuCgXnS8nFSYRp2tJ+4WZXKgTxqPy7SMSls8c3mPT5pkZ17SBToGM5LHEJBO7miEdg==", "dev": true, - "hasInstallScript": true, "license": "MIT", - "dependencies": { - "detect-libc": "^2.0.4", - "tar": "^7.4.3" - }, "engines": { "node": ">= 10" }, "optionalDependencies": { - "@tailwindcss/oxide-android-arm64": "4.1.12", - "@tailwindcss/oxide-darwin-arm64": "4.1.12", - "@tailwindcss/oxide-darwin-x64": "4.1.12", - "@tailwindcss/oxide-freebsd-x64": "4.1.12", - "@tailwindcss/oxide-linux-arm-gnueabihf": "4.1.12", - "@tailwindcss/oxide-linux-arm64-gnu": "4.1.12", - "@tailwindcss/oxide-linux-arm64-musl": "4.1.12", - "@tailwindcss/oxide-linux-x64-gnu": "4.1.12", - "@tailwindcss/oxide-linux-x64-musl": "4.1.12", - "@tailwindcss/oxide-wasm32-wasi": "4.1.12", - "@tailwindcss/oxide-win32-arm64-msvc": "4.1.12", - "@tailwindcss/oxide-win32-x64-msvc": "4.1.12" + "@tailwindcss/oxide-android-arm64": "4.1.16", + "@tailwindcss/oxide-darwin-arm64": "4.1.16", + "@tailwindcss/oxide-darwin-x64": "4.1.16", + "@tailwindcss/oxide-freebsd-x64": "4.1.16", + "@tailwindcss/oxide-linux-arm-gnueabihf": "4.1.16", + "@tailwindcss/oxide-linux-arm64-gnu": "4.1.16", + "@tailwindcss/oxide-linux-arm64-musl": "4.1.16", + "@tailwindcss/oxide-linux-x64-gnu": "4.1.16", + "@tailwindcss/oxide-linux-x64-musl": "4.1.16", + "@tailwindcss/oxide-wasm32-wasi": "4.1.16", + "@tailwindcss/oxide-win32-arm64-msvc": "4.1.16", + "@tailwindcss/oxide-win32-x64-msvc": "4.1.16" } }, "node_modules/@tailwindcss/oxide-android-arm64": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-android-arm64/-/oxide-android-arm64-4.1.12.tgz", - "integrity": "sha512-oNY5pq+1gc4T6QVTsZKwZaGpBb2N1H1fsc1GD4o7yinFySqIuRZ2E4NvGasWc6PhYJwGK2+5YT1f9Tp80zUQZQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-android-arm64/-/oxide-android-arm64-4.1.16.tgz", + "integrity": "sha512-8+ctzkjHgwDJ5caq9IqRSgsP70xhdhJvm+oueS/yhD5ixLhqTw9fSL1OurzMUhBwE5zK26FXLCz2f/RtkISqHA==", "cpu": [ "arm64" ], @@ -5026,9 +4957,9 @@ } }, "node_modules/@tailwindcss/oxide-darwin-arm64": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-arm64/-/oxide-darwin-arm64-4.1.12.tgz", - "integrity": "sha512-cq1qmq2HEtDV9HvZlTtrj671mCdGB93bVY6J29mwCyaMYCP/JaUBXxrQQQm7Qn33AXXASPUb2HFZlWiiHWFytw==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-arm64/-/oxide-darwin-arm64-4.1.16.tgz", + "integrity": "sha512-C3oZy5042v2FOALBZtY0JTDnGNdS6w7DxL/odvSny17ORUnaRKhyTse8xYi3yKGyfnTUOdavRCdmc8QqJYwFKA==", "cpu": [ "arm64" ], @@ -5043,9 +4974,9 @@ } }, "node_modules/@tailwindcss/oxide-darwin-x64": { - "version": "4.1.12", - "resolved": 
"https://registry.npmjs.org/@tailwindcss/oxide-darwin-x64/-/oxide-darwin-x64-4.1.12.tgz", - "integrity": "sha512-6UCsIeFUcBfpangqlXay9Ffty9XhFH1QuUFn0WV83W8lGdX8cD5/+2ONLluALJD5+yJ7k8mVtwy3zMZmzEfbLg==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-x64/-/oxide-darwin-x64-4.1.16.tgz", + "integrity": "sha512-vjrl/1Ub9+JwU6BP0emgipGjowzYZMjbWCDqwA2Z4vCa+HBSpP4v6U2ddejcHsolsYxwL5r4bPNoamlV0xDdLg==", "cpu": [ "x64" ], @@ -5060,9 +4991,9 @@ } }, "node_modules/@tailwindcss/oxide-freebsd-x64": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-freebsd-x64/-/oxide-freebsd-x64-4.1.12.tgz", - "integrity": "sha512-JOH/f7j6+nYXIrHobRYCtoArJdMJh5zy5lr0FV0Qu47MID/vqJAY3r/OElPzx1C/wdT1uS7cPq+xdYYelny1ww==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-freebsd-x64/-/oxide-freebsd-x64-4.1.16.tgz", + "integrity": "sha512-TSMpPYpQLm+aR1wW5rKuUuEruc/oOX3C7H0BTnPDn7W/eMw8W+MRMpiypKMkXZfwH8wqPIRKppuZoedTtNj2tg==", "cpu": [ "x64" ], @@ -5077,9 +5008,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-arm-gnueabihf": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm-gnueabihf/-/oxide-linux-arm-gnueabihf-4.1.12.tgz", - "integrity": "sha512-v4Ghvi9AU1SYgGr3/j38PD8PEe6bRfTnNSUE3YCMIRrrNigCFtHZ2TCm8142X8fcSqHBZBceDx+JlFJEfNg5zQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm-gnueabihf/-/oxide-linux-arm-gnueabihf-4.1.16.tgz", + "integrity": "sha512-p0GGfRg/w0sdsFKBjMYvvKIiKy/LNWLWgV/plR4lUgrsxFAoQBFrXkZ4C0w8IOXfslB9vHK/JGASWD2IefIpvw==", "cpu": [ "arm" ], @@ -5094,9 +5025,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-arm64-gnu": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-gnu/-/oxide-linux-arm64-gnu-4.1.12.tgz", - "integrity": "sha512-YP5s1LmetL9UsvVAKusHSyPlzSRqYyRB0f+Kl/xcYQSPLEw/BvGfxzbH+ihUciePDjiXwHh+p+qbSP3SlJw+6g==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-gnu/-/oxide-linux-arm64-gnu-4.1.16.tgz", + "integrity": "sha512-DoixyMmTNO19rwRPdqviTrG1rYzpxgyYJl8RgQvdAQUzxC1ToLRqtNJpU/ATURSKgIg6uerPw2feW0aS8SNr/w==", "cpu": [ "arm64" ], @@ -5111,9 +5042,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-arm64-musl": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-musl/-/oxide-linux-arm64-musl-4.1.12.tgz", - "integrity": "sha512-V8pAM3s8gsrXcCv6kCHSuwyb/gPsd863iT+v1PGXC4fSL/OJqsKhfK//v8P+w9ThKIoqNbEnsZqNy+WDnwQqCA==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-musl/-/oxide-linux-arm64-musl-4.1.16.tgz", + "integrity": "sha512-H81UXMa9hJhWhaAUca6bU2wm5RRFpuHImrwXBUvPbYb+3jo32I9VIwpOX6hms0fPmA6f2pGVlybO6qU8pF4fzQ==", "cpu": [ "arm64" ], @@ -5128,9 +5059,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-x64-gnu": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-gnu/-/oxide-linux-x64-gnu-4.1.12.tgz", - "integrity": "sha512-xYfqYLjvm2UQ3TZggTGrwxjYaLB62b1Wiysw/YE3Yqbh86sOMoTn0feF98PonP7LtjsWOWcXEbGqDL7zv0uW8Q==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-gnu/-/oxide-linux-x64-gnu-4.1.16.tgz", + "integrity": "sha512-ZGHQxDtFC2/ruo7t99Qo2TTIvOERULPl5l0K1g0oK6b5PGqjYMga+FcY1wIUnrUxY56h28FxybtDEla+ICOyew==", "cpu": [ "x64" ], @@ -5145,9 +5076,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-x64-musl": { - "version": 
"4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-musl/-/oxide-linux-x64-musl-4.1.12.tgz", - "integrity": "sha512-ha0pHPamN+fWZY7GCzz5rKunlv9L5R8kdh+YNvP5awe3LtuXb5nRi/H27GeL2U+TdhDOptU7T6Is7mdwh5Ar3A==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-musl/-/oxide-linux-x64-musl-4.1.16.tgz", + "integrity": "sha512-Oi1tAaa0rcKf1Og9MzKeINZzMLPbhxvm7rno5/zuP1WYmpiG0bEHq4AcRUiG2165/WUzvxkW4XDYCscZWbTLZw==", "cpu": [ "x64" ], @@ -5162,9 +5093,9 @@ } }, "node_modules/@tailwindcss/oxide-wasm32-wasi": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-wasm32-wasi/-/oxide-wasm32-wasi-4.1.12.tgz", - "integrity": "sha512-4tSyu3dW+ktzdEpuk6g49KdEangu3eCYoqPhWNsZgUhyegEda3M9rG0/j1GV/JjVVsj+lG7jWAyrTlLzd/WEBg==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-wasm32-wasi/-/oxide-wasm32-wasi-4.1.16.tgz", + "integrity": "sha512-B01u/b8LteGRwucIBmCQ07FVXLzImWESAIMcUU6nvFt/tYsQ6IHz8DmZ5KtvmwxD+iTYBtM1xwoGXswnlu9v0Q==", "bundleDependencies": [ "@napi-rs/wasm-runtime", "@emnapi/core", @@ -5180,21 +5111,21 @@ "license": "MIT", "optional": true, "dependencies": { - "@emnapi/core": "^1.4.5", - "@emnapi/runtime": "^1.4.5", - "@emnapi/wasi-threads": "^1.0.4", - "@napi-rs/wasm-runtime": "^0.2.12", - "@tybys/wasm-util": "^0.10.0", - "tslib": "^2.8.0" + "@emnapi/core": "^1.5.0", + "@emnapi/runtime": "^1.5.0", + "@emnapi/wasi-threads": "^1.1.0", + "@napi-rs/wasm-runtime": "^1.0.7", + "@tybys/wasm-util": "^0.10.1", + "tslib": "^2.4.0" }, "engines": { "node": ">=14.0.0" } }, "node_modules/@tailwindcss/oxide-win32-arm64-msvc": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-arm64-msvc/-/oxide-win32-arm64-msvc-4.1.12.tgz", - "integrity": "sha512-iGLyD/cVP724+FGtMWslhcFyg4xyYyM+5F4hGvKA7eifPkXHRAUDFaimu53fpNg9X8dfP75pXx/zFt/jlNF+lg==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-arm64-msvc/-/oxide-win32-arm64-msvc-4.1.16.tgz", + "integrity": "sha512-zX+Q8sSkGj6HKRTMJXuPvOcP8XfYON24zJBRPlszcH1Np7xuHXhWn8qfFjIujVzvH3BHU+16jBXwgpl20i+v9A==", "cpu": [ "arm64" ], @@ -5209,9 +5140,9 @@ } }, "node_modules/@tailwindcss/oxide-win32-x64-msvc": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-x64-msvc/-/oxide-win32-x64-msvc-4.1.12.tgz", - "integrity": "sha512-NKIh5rzw6CpEodv/++r0hGLlfgT/gFN+5WNdZtvh6wpU2BpGNgdjvj6H2oFc8nCM839QM1YOhjpgbAONUb4IxA==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-x64-msvc/-/oxide-win32-x64-msvc-4.1.16.tgz", + "integrity": "sha512-m5dDFJUEejbFqP+UXVstd4W/wnxA4F61q8SoL+mqTypId2T2ZpuxosNSgowiCnLp2+Z+rivdU0AqpfgiD7yCBg==", "cpu": [ "x64" ], @@ -5226,51 +5157,51 @@ } }, "node_modules/@tailwindcss/postcss": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/postcss/-/postcss-4.1.12.tgz", - "integrity": "sha512-5PpLYhCAwf9SJEeIsSmCDLgyVfdBhdBpzX1OJ87anT9IVR0Z9pjM0FNixCAUAHGnMBGB8K99SwAheXrT0Kh6QQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/postcss/-/postcss-4.1.16.tgz", + "integrity": "sha512-Qn3SFGPXYQMKR/UtqS+dqvPrzEeBZHrFA92maT4zijCVggdsXnDBMsPFJo1eArX3J+O+Gi+8pV4PkqjLCNBk3A==", "dev": true, "license": "MIT", "dependencies": { "@alloc/quick-lru": "^5.2.0", - "@tailwindcss/node": "4.1.12", - "@tailwindcss/oxide": "4.1.12", + "@tailwindcss/node": "4.1.16", + "@tailwindcss/oxide": "4.1.16", "postcss": "^8.4.41", - 
"tailwindcss": "4.1.12" + "tailwindcss": "4.1.16" } }, "node_modules/@tailwindcss/vite": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/vite/-/vite-4.1.12.tgz", - "integrity": "sha512-4pt0AMFDx7gzIrAOIYgYP0KCBuKWqyW8ayrdiLEjoJTT4pKTjrzG/e4uzWtTLDziC+66R9wbUqZBccJalSE5vQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/vite/-/vite-4.1.16.tgz", + "integrity": "sha512-bbguNBcDxsRmi9nnlWJxhfDWamY3lmcyACHcdO1crxfzuLpOhHLLtEIN/nCbbAtj5rchUgQD17QVAKi1f7IsKg==", "dev": true, "license": "MIT", "dependencies": { - "@tailwindcss/node": "4.1.12", - "@tailwindcss/oxide": "4.1.12", - "tailwindcss": "4.1.12" + "@tailwindcss/node": "4.1.16", + "@tailwindcss/oxide": "4.1.16", + "tailwindcss": "4.1.16" }, "peerDependencies": { "vite": "^5.2.0 || ^6 || ^7" } }, "node_modules/@trpc/client": { - "version": "11.4.4", - "resolved": "https://registry.npmjs.org/@trpc/client/-/client-11.4.4.tgz", - "integrity": "sha512-86OZl+Y+Xlt9ITGlhCMImERcsWCOrVzpNuzg3XBlsDSmSs9NGsghKjeCpJQlE36XaG3aze+o9pRukiYYvBqxgQ==", + "version": "11.7.1", + "resolved": "https://registry.npmjs.org/@trpc/client/-/client-11.7.1.tgz", + "integrity": "sha512-uOnAjElKI892/U6aQMcBHYs3x7mme3Cvv1F87ytBL56rBvs7+DyK7r43zgaXKf13+GtPEI6ex5xjVUfyDW8XcQ==", "funding": [ "https://trpc.io/sponsor" ], "license": "MIT", "peerDependencies": { - "@trpc/server": "11.4.4", + "@trpc/server": "11.7.1", "typescript": ">=5.7.2" } }, "node_modules/@trpc/server": { - "version": "11.4.4", - "resolved": "https://registry.npmjs.org/@trpc/server/-/server-11.4.4.tgz", - "integrity": "sha512-VkJb2xnb4rCynuwlCvgPBh5aM+Dco6fBBIo6lWAdJJRYVwtyE5bxNZBgUvRRz/cFSEAy0vmzLxF7aABDJfK5Rg==", + "version": "11.7.1", + "resolved": "https://registry.npmjs.org/@trpc/server/-/server-11.7.1.tgz", + "integrity": "sha512-N3U8LNLIP4g9C7LJ/sLkjuPHwqlvE3bnspzC4DEFVdvx2+usbn70P80E3wj5cjOTLhmhRiwJCSXhlB+MHfGeCw==", "funding": [ "https://trpc.io/sponsor" ], @@ -5436,9 +5367,9 @@ "license": "MIT" }, "node_modules/@types/semver": { - "version": "7.7.0", - "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.7.0.tgz", - "integrity": "sha512-k107IF4+Xr7UHjwDc7Cfd6PRQfbdkiRabXGRjo07b4WyPahFBZCZ1sE+BNxYIJPPg73UkfOsVOLwqVc/6ETrIA==", + "version": "7.7.1", + "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.7.1.tgz", + "integrity": "sha512-FmgJfu+MOcQ370SD0ev7EI8TlCAfKYU+B4m5T3yXc1CiRN94g/SZPtsCkk506aUDtlMnFZvasDwHHUcZUEaYuA==", "dev": true, "license": "MIT" }, @@ -5457,9 +5388,9 @@ "optional": true }, "node_modules/@types/turndown": { - "version": "5.0.5", - "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.5.tgz", - "integrity": "sha512-TL2IgGgc7B5j78rIccBtlYAnkuv8nUQqhQc+DSYV5j9Be9XOcm/SKOVRuA47xAVI3680Tk9B1d8flK2GWT2+4w==", + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.6.tgz", + "integrity": "sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==", "dev": true, "license": "MIT" }, @@ -6083,9 +6014,9 @@ } }, "node_modules/axios": { - "version": "1.11.0", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.11.0.tgz", - "integrity": "sha512-1Lx3WLFQWm3ooKDYZD1eXmoGO9fxYQjrycfHFC8P0sCfQVXyROp0p9PFWBehewBOdCwHc+f/b8I0fMto5eSfwA==", + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.13.1.tgz", + "integrity": "sha512-hU4EGxxt+j7TQijx1oYdAjw4xuIp1wRQSsbMFwSthCWeBQur1eF+qJ5iQ5sN3Tw8YRzQNKb8jszgBdMDVqwJcw==", "license": "MIT", "dependencies": { "follow-redirects": "^1.15.6", @@ 
-6150,9 +6081,9 @@ "license": "Apache-2.0" }, "node_modules/better-sqlite3": { - "version": "12.2.0", - "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-12.2.0.tgz", - "integrity": "sha512-eGbYq2CT+tos1fBwLQ/tkBt9J5M3JEHjku4hbvQUePCckkvVf14xWj+1m7dGoK81M/fOjFT7yM9UMeKT/+vFLQ==", + "version": "12.4.1", + "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-12.4.1.tgz", + "integrity": "sha512-3yVdyZhklTiNrtg+4WqHpJpFDd+WHTg2oM7UcR80GqL05AOV0xEJzc6qNvFYoEtE+hRp1n9MpN6/+4yhlGkDXQ==", "hasInstallScript": true, "license": "MIT", "dependencies": { @@ -6571,16 +6502,6 @@ "url": "https://github.com/sponsors/fb55" } }, - "node_modules/chownr": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz", - "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==", - "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=18" - } - }, "node_modules/clean-stack": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/clean-stack/-/clean-stack-2.2.0.tgz", @@ -6872,42 +6793,34 @@ } }, "node_modules/cli-truncate": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-4.0.0.tgz", - "integrity": "sha512-nPdaFdQ0h/GEigbPClz11D0v/ZJEwxmeVZGeMo3Z5StPtUTkA9o1lD6QwoirYiSDzbcwn2XcjwmCp68W1IS4TA==", + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-5.1.1.tgz", + "integrity": "sha512-SroPvNHxUnk+vIW/dOSfNqdy1sPEFkrTk6TUtqLCnBlo3N7TNYYkzzN7uSD6+jVjrdO4+p8nH7JzH6cIvUem6A==", "dev": true, "license": "MIT", "dependencies": { - "slice-ansi": "^5.0.0", - "string-width": "^7.0.0" + "slice-ansi": "^7.1.0", + "string-width": "^8.0.0" }, "engines": { - "node": ">=18" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/cli-truncate/node_modules/emoji-regex": { - "version": "10.4.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", - "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", - "dev": true, - "license": "MIT" - }, "node_modules/cli-truncate/node_modules/string-width": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", - "integrity": "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==", + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-8.1.0.tgz", + "integrity": "sha512-Kxl3KJGb/gxkaUMOjRsQ8IrXiGW75O4E3RPjFIINOVH8AMl2SQ/yWdTzWwF3FevIX9LcMAjJW+GRwAlAbTSXdg==", "dev": true, "license": "MIT", "dependencies": { - "emoji-regex": "^10.3.0", - "get-east-asian-width": "^1.0.0", + "get-east-asian-width": "^1.3.0", "strip-ansi": "^7.1.0" }, "engines": { - "node": ">=18" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -7057,9 +6970,9 @@ } }, "node_modules/commander": { - "version": "14.0.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.0.tgz", - "integrity": "sha512-2uM9rYjPvyq39NwLRqaiLtWHyDC1FvryJDa2ATTVims5YAS4PupsEQsDvP14FqhFr0P49CYDugi59xaxJlTXRA==", + "version": "14.0.2", + "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.2.tgz", + "integrity": "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ==", "license": "MIT", "engines": { "node": ">=20" @@ -7753,9 +7666,9 @@ } }, 
"node_modules/dompurify": { - "version": "3.2.6", - "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.2.6.tgz", - "integrity": "sha512-/2GogDQlohXPZe6D6NOgQvXLPSYBqIWMnZ8zzOhn09REE4eyAzb+Hed3jhoM9OkuaJ8P6ZGTTVWQKAi8ieIzfQ==", + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.3.0.tgz", + "integrity": "sha512-r+f6MYR1gGN1eJv0TVQbhA7if/U7P87cdPl3HN5rikqaBSBxLiCb/b9O+2eG0cxz0ghyU+mU1QkbsOwERMYlWQ==", "license": "(MPL-2.0 OR Apache-2.0)", "optionalDependencies": { "@types/trusted-types": "^2.0.7" @@ -7789,9 +7702,9 @@ } }, "node_modules/dotenv": { - "version": "17.2.1", - "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.2.1.tgz", - "integrity": "sha512-kQhDYKZecqnM0fCnzI5eIv5L4cAe/iRI+HqMbO/hbRdTAeXDG+M9FjipUxNfbARuEg4iHIbhnhs78BCHNbSxEQ==", + "version": "17.2.3", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.2.3.tgz", + "integrity": "sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==", "license": "BSD-2-Clause", "engines": { "node": ">=12" @@ -8519,23 +8432,6 @@ "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", "license": "MIT" }, - "node_modules/fast-glob": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz", - "integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@nodelib/fs.stat": "^2.0.2", - "@nodelib/fs.walk": "^1.2.3", - "glob-parent": "^5.1.2", - "merge2": "^1.3.0", - "micromatch": "^4.0.8" - }, - "engines": { - "node": ">=8.6.0" - } - }, "node_modules/fast-json-stable-stringify": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", @@ -8619,9 +8515,9 @@ } }, "node_modules/fastify": { - "version": "5.5.0", - "resolved": "https://registry.npmjs.org/fastify/-/fastify-5.5.0.tgz", - "integrity": "sha512-ZWSWlzj3K/DcULCnCjEiC2zn2FBPdlZsSA/pnPa/dbUfLvxkD/Nqmb0XXMXLrWkeM4uQPUvjdJpwtXmTfriXqw==", + "version": "5.6.1", + "resolved": "https://registry.npmjs.org/fastify/-/fastify-5.6.1.tgz", + "integrity": "sha512-WjjlOciBF0K8pDUPZoGPhqhKrQJ02I8DKaDIfO51EL0kbSMwQFl85cRwhOvmSDWoukNOdTo27gLN549pLCcH7Q==", "funding": [ { "type": "github", @@ -9148,9 +9044,9 @@ } }, "node_modules/generative-bayesian-network": { - "version": "2.1.70", - "resolved": "https://registry.npmjs.org/generative-bayesian-network/-/generative-bayesian-network-2.1.70.tgz", - "integrity": "sha512-nP0CNiVs/QS5ppMsGiEYN3dgAe3UTT1mpDth0wTh9uEyEO4e7y1Yr5PGDcTJsU0Lm3YM21yNzhuPbUg7etKHbQ==", + "version": "2.1.76", + "resolved": "https://registry.npmjs.org/generative-bayesian-network/-/generative-bayesian-network-2.1.76.tgz", + "integrity": "sha512-e9BByo5UEXPsrOii4RM94a02y1JXhP5XZKbzC5GWDz62Bbh2jWbrkY0ta2cF1rxrv8pqLu4c98yQC2F50Eqa7A==", "license": "Apache-2.0", "dependencies": { "adm-zip": "^0.5.9", @@ -9167,9 +9063,9 @@ } }, "node_modules/get-east-asian-width": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.3.0.tgz", - "integrity": "sha512-vpeMIQKxczTD/0s2CdEWHcb0eeJe6TFjxb+J5xgX7hScxqrGuyjmv4c1D4A/gelKfyox0gJJwIHF+fLjeaM8kQ==", + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.4.0.tgz", + "integrity": 
"sha512-QZjmEOC+IT1uk6Rx0sX22V6uHWVwbdbxf1faPqJ1QhLdGgsRGCZoyaQBm/piRdJy/D2um6hM1UP7ZEeQ4EkP+Q==", "dev": true, "license": "MIT", "engines": { @@ -9319,17 +9215,21 @@ "url": "https://github.com/sponsors/isaacs" } }, - "node_modules/glob-parent": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", - "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "node_modules/glob-to-regex.js": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/glob-to-regex.js/-/glob-to-regex.js-1.2.0.tgz", + "integrity": "sha512-QMwlOQKU/IzqMUOAZWubUOT8Qft+Y0KQWnX9nK3ch0CJg0tTp4TvGZsTfudYKv2NzoQSyPcnA6TYeIQ3jGichQ==", "dev": true, - "license": "ISC", - "dependencies": { - "is-glob": "^4.0.1" - }, + "license": "Apache-2.0", "engines": { - "node": ">= 6" + "node": ">=10.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/streamich" + }, + "peerDependencies": { + "tslib": "2" } }, "node_modules/global-directory": { @@ -9365,53 +9265,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/globby": { - "version": "14.1.0", - "resolved": "https://registry.npmjs.org/globby/-/globby-14.1.0.tgz", - "integrity": "sha512-0Ia46fDOaT7k4og1PDW4YbodWWr3scS2vAr2lTbsplOt2WkKp0vQbkI9wKis/T5LV/dqPjO3bpS/z6GTJB82LA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@sindresorhus/merge-streams": "^2.1.0", - "fast-glob": "^3.3.3", - "ignore": "^7.0.3", - "path-type": "^6.0.0", - "slash": "^5.1.0", - "unicorn-magic": "^0.3.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/globby/node_modules/path-type": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/path-type/-/path-type-6.0.0.tgz", - "integrity": "sha512-Vj7sf++t5pBD637NSfkxpHSMfWaeig5+DKWLhcqIYx6mWQz5hdJTGDVMQiJcw1ZYkhs7AazKDGpRVji1LJCZUQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/globby/node_modules/unicorn-magic": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/unicorn-magic/-/unicorn-magic-0.3.0.tgz", - "integrity": "sha512-+QBBXBCvifc56fsbuxZQ6Sic3wqqc3WWaqxs58gvJrcOuN83HGTCwz3oS5phzU9LthRNE9VrJCFCLUgHeeFnfA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/google-auth-library": { "version": "10.2.1", "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-10.2.1.tgz", @@ -9645,13 +9498,13 @@ } }, "node_modules/header-generator": { - "version": "2.1.70", - "resolved": "https://registry.npmjs.org/header-generator/-/header-generator-2.1.70.tgz", - "integrity": "sha512-s2/jN4hIr/pDRZhXA1D2T72eO4f8Gi1mwYEIFLbU+OR7cjo+Tayrw4RlTN3dXPahrU/MBdjk9gv//MwxLoCpGQ==", + "version": "2.1.76", + "resolved": "https://registry.npmjs.org/header-generator/-/header-generator-2.1.76.tgz", + "integrity": "sha512-Lqk4zU/MIHkm29Sfle6E3Jo2gUoscoG9x12jDt1RbH3kRq/RN+NRSoRRYggmkI0GQSS0wiOIfWwjgIRrA9nHqA==", "license": "Apache-2.0", "dependencies": { "browserslist": "^4.21.1", - "generative-bayesian-network": "^2.1.70", + "generative-bayesian-network": "^2.1.76", "ow": "^0.28.1", "tslib": "^2.4.0" }, @@ -9670,13 +9523,13 @@ } }, "node_modules/hook-std": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/hook-std/-/hook-std-3.0.0.tgz", 
- "integrity": "sha512-jHRQzjSDzMtFy34AGj1DN+vq54WVuhSvKgrHf0OMiFQTwDD4L/qqofVEWjLOBMTn5+lCD3fPg32W9yOfnEJTTw==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/hook-std/-/hook-std-4.0.0.tgz", + "integrity": "sha512-IHI4bEVOt3vRUDJ+bFA9VUJlo7SzvFARPNLw75pqSmAOP2HmTWfFJtPvLBrDrlgjEYXY9zs7SFdHPQaJShkSCQ==", "dev": true, "license": "MIT", "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -9873,16 +9726,6 @@ ], "license": "BSD-3-Clause" }, - "node_modules/ignore": { - "version": "7.0.5", - "resolved": "https://registry.npmjs.org/ignore/-/ignore-7.0.5.tgz", - "integrity": "sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 4" - } - }, "node_modules/import-fresh": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", @@ -10166,16 +10009,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/is-finalizationregistry": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/is-finalizationregistry/-/is-finalizationregistry-1.1.1.tgz", @@ -10193,13 +10026,16 @@ } }, "node_modules/is-fullwidth-code-point": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-4.0.0.tgz", - "integrity": "sha512-O4L094N2/dZ7xqVdrXhh9r1KODPJpFms8B5sGdJLPy664AgvXsreZUyCQQNItZRDlYug4xStLjNp/sz3HvBowQ==", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-5.1.0.tgz", + "integrity": "sha512-5XHYaSyiqADb4RnZ1Bdad6cPp8Toise4TzEjcOYDHZkTCbKgiUl7WTUCpNWHuxmDt91wnsZBc9xinNzopv3JMQ==", "dev": true, "license": "MIT", + "dependencies": { + "get-east-asian-width": "^1.3.1" + }, "engines": { - "node": ">=12" + "node": ">=18" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -10224,19 +10060,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-glob": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", - "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", - "dev": true, - "license": "MIT", - "dependencies": { - "is-extglob": "^2.1.1" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/is-map": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz", @@ -10588,9 +10411,9 @@ } }, "node_modules/jiti": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.5.1.tgz", - "integrity": "sha512-twQoecYPiVA5K/h6SxtORw/Bs3ar+mLUtoPSc7iMXzQzK8d7eJ/R09wmTwAjiamETn1cXYPGfNnu7DMoHgu12w==", + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.6.1.tgz", + "integrity": "sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ==", "dev": true, "license": "MIT", "bin": { @@ -10605,9 +10428,9 @@ "license": "MIT" }, "node_modules/jose": { - "version": "6.0.12", - "resolved": "https://registry.npmjs.org/jose/-/jose-6.0.12.tgz", - "integrity": 
"sha512-T8xypXs8CpmiIi78k0E+Lk7T2zlK4zDyg+o1CZ4AkOHgDg98ogdP2BeZ61lTFKFyoEwJ9RgAgN+SdM3iPgNonQ==", + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.0.tgz", + "integrity": "sha512-TTQJyoEoKcC1lscpVDCSsVgYzUDg/0Bt3WE//WiTPK6uOCQC2KZS4MpugbMWt/zyjkopgZoXhZuCi00gLudfUA==", "license": "MIT", "funding": { "url": "https://github.com/sponsors/panva" @@ -11048,9 +10871,9 @@ "license": "MIT" }, "node_modules/lightningcss": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss/-/lightningcss-1.30.1.tgz", - "integrity": "sha512-xi6IyHML+c9+Q3W0S4fCQJOym42pyurFiJUHEcEyHS0CeKzia4yZDEsLlqOFykxOdHpNy0NmvVO31vcSqAxJCg==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss/-/lightningcss-1.30.2.tgz", + "integrity": "sha512-utfs7Pr5uJyyvDETitgsaqSyjCb2qNRAtuqUeWIAKztsOYdcACf2KtARYXg2pSvhkt+9NfoaNY7fxjl6nuMjIQ==", "dev": true, "license": "MPL-2.0", "dependencies": { @@ -11064,22 +10887,44 @@ "url": "https://opencollective.com/parcel" }, "optionalDependencies": { - "lightningcss-darwin-arm64": "1.30.1", - "lightningcss-darwin-x64": "1.30.1", - "lightningcss-freebsd-x64": "1.30.1", - "lightningcss-linux-arm-gnueabihf": "1.30.1", - "lightningcss-linux-arm64-gnu": "1.30.1", - "lightningcss-linux-arm64-musl": "1.30.1", - "lightningcss-linux-x64-gnu": "1.30.1", - "lightningcss-linux-x64-musl": "1.30.1", - "lightningcss-win32-arm64-msvc": "1.30.1", - "lightningcss-win32-x64-msvc": "1.30.1" + "lightningcss-android-arm64": "1.30.2", + "lightningcss-darwin-arm64": "1.30.2", + "lightningcss-darwin-x64": "1.30.2", + "lightningcss-freebsd-x64": "1.30.2", + "lightningcss-linux-arm-gnueabihf": "1.30.2", + "lightningcss-linux-arm64-gnu": "1.30.2", + "lightningcss-linux-arm64-musl": "1.30.2", + "lightningcss-linux-x64-gnu": "1.30.2", + "lightningcss-linux-x64-musl": "1.30.2", + "lightningcss-win32-arm64-msvc": "1.30.2", + "lightningcss-win32-x64-msvc": "1.30.2" + } + }, + "node_modules/lightningcss-android-arm64": { + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-android-arm64/-/lightningcss-android-arm64-1.30.2.tgz", + "integrity": "sha512-BH9sEdOCahSgmkVhBLeU7Hc9DWeZ1Eb6wNS6Da8igvUwAe0sqROHddIlvU06q3WyXVEOYDZ6ykBZQnjTbmo4+A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MPL-2.0", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" } }, "node_modules/lightningcss-darwin-arm64": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-darwin-arm64/-/lightningcss-darwin-arm64-1.30.1.tgz", - "integrity": "sha512-c8JK7hyE65X1MHMN+Viq9n11RRC7hgin3HhYKhrMyaXflk5GVplZ60IxyoVtzILeKr+xAJwg6zK6sjTBJ0FKYQ==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-darwin-arm64/-/lightningcss-darwin-arm64-1.30.2.tgz", + "integrity": "sha512-ylTcDJBN3Hp21TdhRT5zBOIi73P6/W0qwvlFEk22fkdXchtNTOU4Qc37SkzV+EKYxLouZ6M4LG9NfZ1qkhhBWA==", "cpu": [ "arm64" ], @@ -11098,9 +10943,9 @@ } }, "node_modules/lightningcss-darwin-x64": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-darwin-x64/-/lightningcss-darwin-x64-1.30.1.tgz", - "integrity": "sha512-k1EvjakfumAQoTfcXUcHQZhSpLlkAuEkdMBsI/ivWw9hL+7FtilQc0Cy3hrx0AAQrVtQAbMI7YjCgYgvn37PzA==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-darwin-x64/-/lightningcss-darwin-x64-1.30.2.tgz", + "integrity": 
"sha512-oBZgKchomuDYxr7ilwLcyms6BCyLn0z8J0+ZZmfpjwg9fRVZIR5/GMXd7r9RH94iDhld3UmSjBM6nXWM2TfZTQ==", "cpu": [ "x64" ], @@ -11119,9 +10964,9 @@ } }, "node_modules/lightningcss-freebsd-x64": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-freebsd-x64/-/lightningcss-freebsd-x64-1.30.1.tgz", - "integrity": "sha512-kmW6UGCGg2PcyUE59K5r0kWfKPAVy4SltVeut+umLCFoJ53RdCUWxcRDzO1eTaxf/7Q2H7LTquFHPL5R+Gjyig==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-freebsd-x64/-/lightningcss-freebsd-x64-1.30.2.tgz", + "integrity": "sha512-c2bH6xTrf4BDpK8MoGG4Bd6zAMZDAXS569UxCAGcA7IKbHNMlhGQ89eRmvpIUGfKWNVdbhSbkQaWhEoMGmGslA==", "cpu": [ "x64" ], @@ -11140,9 +10985,9 @@ } }, "node_modules/lightningcss-linux-arm-gnueabihf": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-arm-gnueabihf/-/lightningcss-linux-arm-gnueabihf-1.30.1.tgz", - "integrity": "sha512-MjxUShl1v8pit+6D/zSPq9S9dQ2NPFSQwGvxBCYaBYLPlCWuPh9/t1MRS8iUaR8i+a6w7aps+B4N0S1TYP/R+Q==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-linux-arm-gnueabihf/-/lightningcss-linux-arm-gnueabihf-1.30.2.tgz", + "integrity": "sha512-eVdpxh4wYcm0PofJIZVuYuLiqBIakQ9uFZmipf6LF/HRj5Bgm0eb3qL/mr1smyXIS1twwOxNWndd8z0E374hiA==", "cpu": [ "arm" ], @@ -11161,9 +11006,9 @@ } }, "node_modules/lightningcss-linux-arm64-gnu": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-arm64-gnu/-/lightningcss-linux-arm64-gnu-1.30.1.tgz", - "integrity": "sha512-gB72maP8rmrKsnKYy8XUuXi/4OctJiuQjcuqWNlJQ6jZiWqtPvqFziskH3hnajfvKB27ynbVCucKSm2rkQp4Bw==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-linux-arm64-gnu/-/lightningcss-linux-arm64-gnu-1.30.2.tgz", + "integrity": "sha512-UK65WJAbwIJbiBFXpxrbTNArtfuznvxAJw4Q2ZGlU8kPeDIWEX1dg3rn2veBVUylA2Ezg89ktszWbaQnxD/e3A==", "cpu": [ "arm64" ], @@ -11182,9 +11027,9 @@ } }, "node_modules/lightningcss-linux-arm64-musl": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-arm64-musl/-/lightningcss-linux-arm64-musl-1.30.1.tgz", - "integrity": "sha512-jmUQVx4331m6LIX+0wUhBbmMX7TCfjF5FoOH6SD1CttzuYlGNVpA7QnrmLxrsub43ClTINfGSYyHe2HWeLl5CQ==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-linux-arm64-musl/-/lightningcss-linux-arm64-musl-1.30.2.tgz", + "integrity": "sha512-5Vh9dGeblpTxWHpOx8iauV02popZDsCYMPIgiuw97OJ5uaDsL86cnqSFs5LZkG3ghHoX5isLgWzMs+eD1YzrnA==", "cpu": [ "arm64" ], @@ -11203,9 +11048,9 @@ } }, "node_modules/lightningcss-linux-x64-gnu": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-x64-gnu/-/lightningcss-linux-x64-gnu-1.30.1.tgz", - "integrity": "sha512-piWx3z4wN8J8z3+O5kO74+yr6ze/dKmPnI7vLqfSqI8bccaTGY5xiSGVIJBDd5K5BHlvVLpUB3S2YCfelyJ1bw==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-linux-x64-gnu/-/lightningcss-linux-x64-gnu-1.30.2.tgz", + "integrity": "sha512-Cfd46gdmj1vQ+lR6VRTTadNHu6ALuw2pKR9lYq4FnhvgBc4zWY1EtZcAc6EffShbb1MFrIPfLDXD6Xprbnni4w==", "cpu": [ "x64" ], @@ -11224,9 +11069,9 @@ } }, "node_modules/lightningcss-linux-x64-musl": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-x64-musl/-/lightningcss-linux-x64-musl-1.30.1.tgz", - "integrity": "sha512-rRomAK7eIkL+tHY0YPxbc5Dra2gXlI63HL+v1Pdi1a3sC+tJTcFrHX+E86sulgAXeI7rSzDYhPSeHHjqFhqfeQ==", + "version": "1.30.2", + "resolved": 
"https://registry.npmjs.org/lightningcss-linux-x64-musl/-/lightningcss-linux-x64-musl-1.30.2.tgz", + "integrity": "sha512-XJaLUUFXb6/QG2lGIW6aIk6jKdtjtcffUT0NKvIqhSBY3hh9Ch+1LCeH80dR9q9LBjG3ewbDjnumefsLsP6aiA==", "cpu": [ "x64" ], @@ -11245,9 +11090,9 @@ } }, "node_modules/lightningcss-win32-arm64-msvc": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-win32-arm64-msvc/-/lightningcss-win32-arm64-msvc-1.30.1.tgz", - "integrity": "sha512-mSL4rqPi4iXq5YVqzSsJgMVFENoa4nGTT/GjO2c0Yl9OuQfPsIfncvLrEW6RbbB24WtZ3xP/2CCmI3tNkNV4oA==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-win32-arm64-msvc/-/lightningcss-win32-arm64-msvc-1.30.2.tgz", + "integrity": "sha512-FZn+vaj7zLv//D/192WFFVA0RgHawIcHqLX9xuWiQt7P0PtdFEVaxgF9rjM/IRYHQXNnk61/H/gb2Ei+kUQ4xQ==", "cpu": [ "arm64" ], @@ -11266,9 +11111,9 @@ } }, "node_modules/lightningcss-win32-x64-msvc": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-win32-x64-msvc/-/lightningcss-win32-x64-msvc-1.30.1.tgz", - "integrity": "sha512-PVqXh48wh4T53F/1CCu8PIPCxLzWyCnn/9T5W1Jpmdy5h9Cwd+0YQS6/LwhHXSafuc61/xg9Lv5OrCby6a++jg==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-win32-x64-msvc/-/lightningcss-win32-x64-msvc-1.30.2.tgz", + "integrity": "sha512-5g1yc73p+iAkid5phb4oVFMB45417DkRevRbt/El/gKXJk4jid+vPFF/AXbxn05Aky8PapwzZrdJShv5C0avjw==", "cpu": [ "x64" ], @@ -11286,19 +11131,6 @@ "url": "https://opencollective.com/parcel" } }, - "node_modules/lilconfig": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", - "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/sponsors/antonk52" - } - }, "node_modules/lines-and-columns": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", @@ -11307,19 +11139,16 @@ "license": "MIT" }, "node_modules/lint-staged": { - "version": "16.1.5", - "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-16.1.5.tgz", - "integrity": "sha512-uAeQQwByI6dfV7wpt/gVqg+jAPaSp8WwOA8kKC/dv1qw14oGpnpAisY65ibGHUGDUv0rYaZ8CAJZ/1U8hUvC2A==", + "version": "16.2.6", + "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-16.2.6.tgz", + "integrity": "sha512-s1gphtDbV4bmW1eylXpVMk2u7is7YsrLl8hzrtvC70h4ByhcMLZFY01Fx05ZUDNuv1H8HO4E+e2zgejV1jVwNw==", "dev": true, "license": "MIT", "dependencies": { - "chalk": "^5.5.0", - "commander": "^14.0.0", - "debug": "^4.4.1", - "lilconfig": "^3.1.3", - "listr2": "^9.0.1", + "commander": "^14.0.1", + "listr2": "^9.0.5", "micromatch": "^4.0.8", - "nano-spawn": "^1.0.2", + "nano-spawn": "^2.0.0", "pidtree": "^0.6.0", "string-argv": "^0.3.2", "yaml": "^2.8.1" @@ -11335,13 +11164,13 @@ } }, "node_modules/listr2": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/listr2/-/listr2-9.0.1.tgz", - "integrity": "sha512-SL0JY3DaxylDuo/MecFeiC+7pedM0zia33zl0vcjgwcq1q1FWWF1To9EIauPbl8GbMCU0R2e0uJ8bZunhYKD2g==", + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/listr2/-/listr2-9.0.5.tgz", + "integrity": "sha512-ME4Fb83LgEgwNw96RKNvKV4VTLuXfoKudAmm2lP8Kk87KaMK0/Xrx/aAkMWmT8mDb+3MlFDspfbCs7adjRxA2g==", "dev": true, "license": "MIT", "dependencies": { - "cli-truncate": "^4.0.0", + "cli-truncate": "^5.0.0", "colorette": "^2.0.20", "eventemitter3": "^5.0.1", "log-update": "^6.1.0", @@ 
-11353,9 +11182,9 @@ } }, "node_modules/listr2/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", + "integrity": "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg==", "dev": true, "license": "MIT", "engines": { @@ -11366,9 +11195,9 @@ } }, "node_modules/listr2/node_modules/emoji-regex": { - "version": "10.4.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", - "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.6.0.tgz", + "integrity": "sha512-toUI84YS5YmxW219erniWD0CIVOo46xGKColeNQRgOzDorgBi1v4D71/OFzgD9GO2UGKIv1C3Sp8DAn0+j5w7A==", "dev": true, "license": "MIT" }, @@ -11391,9 +11220,9 @@ } }, "node_modules/listr2/node_modules/wrap-ansi": { - "version": "9.0.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.0.tgz", - "integrity": "sha512-G8ura3S+3Z2G+mkgNRq8dqaFZAuxfsxpBB8OCTGRTCtp+l/v9nbFNmCUP1BZMts3G1142MsZfn6eeUKrr4PD1Q==", + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.2.tgz", + "integrity": "sha512-42AtmgqjV+X1VpdOfyTGOYRi0/zsoLqtXQckTmqTeybT+BDIbM/Guxo7x3pE2vtpr1ok6xRqM9OpBe+Jyoqyww==", "dev": true, "license": "MIT", "dependencies": { @@ -11612,9 +11441,9 @@ } }, "node_modules/log-update/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", + "integrity": "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg==", "dev": true, "license": "MIT", "engines": { @@ -11625,45 +11454,12 @@ } }, "node_modules/log-update/node_modules/emoji-regex": { - "version": "10.4.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", - "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.6.0.tgz", + "integrity": "sha512-toUI84YS5YmxW219erniWD0CIVOo46xGKColeNQRgOzDorgBi1v4D71/OFzgD9GO2UGKIv1C3Sp8DAn0+j5w7A==", "dev": true, "license": "MIT" }, - "node_modules/log-update/node_modules/is-fullwidth-code-point": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-5.0.0.tgz", - "integrity": "sha512-OVa3u9kkBbw7b8Xw5F9P+D/T9X+Z4+JruYVNapTjPYZYUznQ5YfWeFkOj606XYYW8yugTfC8Pj0hYqvi4ryAhA==", - "dev": true, - "license": "MIT", - "dependencies": { - "get-east-asian-width": "^1.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/log-update/node_modules/slice-ansi": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-7.1.0.tgz", - "integrity": "sha512-bSiSngZ/jWeX93BqeIAbImyTbEihizcwNjFoRUIY/T1wWQsfsm2Vw1agPKylXvQTU7iASGdHhyqRlqQzfz+Htg==", - 
"dev": true, - "license": "MIT", - "dependencies": { - "ansi-styles": "^6.2.1", - "is-fullwidth-code-point": "^5.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/chalk/slice-ansi?sponsor=1" - } - }, "node_modules/log-update/node_modules/string-width": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", @@ -11683,9 +11479,9 @@ } }, "node_modules/log-update/node_modules/wrap-ansi": { - "version": "9.0.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.0.tgz", - "integrity": "sha512-G8ura3S+3Z2G+mkgNRq8dqaFZAuxfsxpBB8OCTGRTCtp+l/v9nbFNmCUP1BZMts3G1142MsZfn6eeUKrr4PD1Q==", + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.2.tgz", + "integrity": "sha512-42AtmgqjV+X1VpdOfyTGOYRi0/zsoLqtXQckTmqTeybT+BDIbM/Guxo7x3pE2vtpr1ok6xRqM9OpBe+Jyoqyww==", "dev": true, "license": "MIT", "dependencies": { @@ -11724,13 +11520,13 @@ "license": "ISC" }, "node_modules/magic-string": { - "version": "0.30.17", - "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.17.tgz", - "integrity": "sha512-sNPKHvyjVf7gyjwS4xGTaW/mCnF8wnjtifKBEhxfZ7E/S8tQ0rssrwGNn6q8JH/ohItJfSQp9mBtQYuTlH5QnA==", + "version": "0.30.21", + "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz", + "integrity": "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==", "dev": true, "license": "MIT", "dependencies": { - "@jridgewell/sourcemap-codec": "^1.5.0" + "@jridgewell/sourcemap-codec": "^1.5.5" } }, "node_modules/markdown-table": { @@ -12007,21 +11803,19 @@ } }, "node_modules/memfs": { - "version": "4.36.3", - "resolved": "https://registry.npmjs.org/memfs/-/memfs-4.36.3.tgz", - "integrity": "sha512-rZIVsNPGdZDPls/ckWhIsod2zRNsI2f2kEru0gMldkrEve+fPn7CVBTvfKLNyHQ9rZDWwzVBF8tPsZivzDPiZQ==", + "version": "4.50.0", + "resolved": "https://registry.npmjs.org/memfs/-/memfs-4.50.0.tgz", + "integrity": "sha512-N0LUYQMUA1yS5tJKmMtU9yprPm6ZIg24yr/OVv/7t6q0kKDIho4cBbXRi1XKttUmNYDYgF/q45qrKE/UhGO0CA==", "dev": true, "license": "Apache-2.0", "dependencies": { "@jsonjoy.com/json-pack": "^1.11.0", "@jsonjoy.com/util": "^1.9.0", + "glob-to-regex.js": "^1.0.1", "thingies": "^2.5.0", "tree-dump": "^1.0.3", "tslib": "^2.0.0" }, - "engines": { - "node": ">= 4.0.0" - }, "funding": { "type": "github", "url": "https://github.com/sponsors/streamich" @@ -12068,16 +11862,6 @@ "dev": true, "license": "MIT" }, - "node_modules/merge2": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", - "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 8" - } - }, "node_modules/micromark": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.2.tgz", @@ -12669,9 +12453,9 @@ } }, "node_modules/mime": { - "version": "4.0.7", - "resolved": "https://registry.npmjs.org/mime/-/mime-4.0.7.tgz", - "integrity": "sha512-2OfDPL+e03E0LrXaGYOtTFIYhiuzep94NSsuhrNULq+stylcJedcHdzHtz0atMUuGwJfFYs0YL5xeC/Ca2x0eQ==", + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-4.1.0.tgz", + "integrity": "sha512-X5ju04+cAzsojXKes0B/S4tcYtFAJ6tTMuSPBEn9CPGlrWr8Fiw7qYeLT0XyH80HSoAoqWCaz+MWKh22P7G1cw==", "funding": [ "https://github.com/sponsors/broofa" ], @@ -12781,35 +12565,6 @@ "node": ">=16 || 14 >=14.17" } }, - "node_modules/minizlib": { - "version": 
"3.0.2", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-3.0.2.tgz", - "integrity": "sha512-oG62iEk+CYt5Xj2YqI5Xi9xWUeZhDI8jjQmC5oThVH5JGCTgIjr7ciJDzC7MBzYd//WvR1OTmP5Q38Q8ShQtVA==", - "dev": true, - "license": "MIT", - "dependencies": { - "minipass": "^7.1.2" - }, - "engines": { - "node": ">= 18" - } - }, - "node_modules/mkdirp": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-3.0.1.tgz", - "integrity": "sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==", - "dev": true, - "license": "MIT", - "bin": { - "mkdirp": "dist/cjs/src/bin.js" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, "node_modules/mkdirp-classic": { "version": "0.5.3", "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", @@ -12884,9 +12639,9 @@ } }, "node_modules/nano-spawn": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/nano-spawn/-/nano-spawn-1.0.2.tgz", - "integrity": "sha512-21t+ozMQDAL/UGgQVBbZ/xXvNO10++ZPuTmKRO8k9V3AClVRht49ahtDjfY8l1q6nSHOrE5ASfthzH3ol6R/hg==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/nano-spawn/-/nano-spawn-2.0.0.tgz", + "integrity": "sha512-tacvGzUY5o2D8CBh2rrwxyNojUsZNU2zjNTzKQrkgGJQTbGAfArVWXSKMBokBeeg6C7OLRGUEyoFlYbfeWQIqw==", "dev": true, "license": "MIT", "engines": { @@ -16848,12 +16603,12 @@ "license": "MIT" }, "node_modules/posthog-node": { - "version": "5.7.0", - "resolved": "https://registry.npmjs.org/posthog-node/-/posthog-node-5.7.0.tgz", - "integrity": "sha512-6J1AIZWtbr2lEbZOO2AzO/h1FPJjUZM4KWcdaL2UQw7FY8J7VNaH3NiaRockASFmglpID7zEY25gV/YwCtuXjg==", + "version": "5.11.0", + "resolved": "https://registry.npmjs.org/posthog-node/-/posthog-node-5.11.0.tgz", + "integrity": "sha512-9+gmWp/7AEryJMi0+/ywJjKQhpkmcjxf+eT030fTIIPvFTF84zeeagdZBGNC/Nh2Jc0grIAW6O1n5lxXiX3daA==", "license": "MIT", "dependencies": { - "@posthog/core": "1.0.0" + "@posthog/core": "1.5.0" }, "engines": { "node": ">=20" @@ -17042,27 +16797,6 @@ ], "license": "MIT" }, - "node_modules/queue-microtask": { - "version": "1.2.3", - "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", - "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" - }, "node_modules/quick-format-unescaped": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/quick-format-unescaped/-/quick-format-unescaped-4.0.4.tgz", @@ -17531,30 +17265,6 @@ "integrity": "sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw==", "license": "MIT" }, - "node_modules/run-parallel": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", - "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT", - "dependencies": { - "queue-microtask": "^1.2.2" - } - }, 
"node_modules/safe-array-concat": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.3.tgz", @@ -17707,9 +17417,9 @@ "license": "BSD-3-Clause" }, "node_modules/semantic-release": { - "version": "24.2.7", - "resolved": "https://registry.npmjs.org/semantic-release/-/semantic-release-24.2.7.tgz", - "integrity": "sha512-g7RssbTAbir1k/S7uSwSVZFfFXwpomUB9Oas0+xi9KStSCmeDXcA7rNhiskjLqvUe/Evhx8fVCT16OSa34eM5g==", + "version": "24.2.9", + "resolved": "https://registry.npmjs.org/semantic-release/-/semantic-release-24.2.9.tgz", + "integrity": "sha512-phCkJ6pjDi9ANdhuF5ElS10GGdAKY6R1Pvt9lT3SFhOwM4T7QZE7MLpBDbNruUx/Q3gFD92/UOFringGipRqZA==", "dev": true, "license": "MIT", "dependencies": { @@ -17727,7 +17437,7 @@ "find-versions": "^6.0.0", "get-stream": "^6.0.0", "git-log-parser": "^1.2.0", - "hook-std": "^3.0.0", + "hook-std": "^4.0.0", "hosted-git-info": "^8.0.0", "import-from-esm": "^2.0.0", "lodash-es": "^4.17.21", @@ -17739,7 +17449,7 @@ "read-package-up": "^11.0.0", "resolve-from": "^5.0.0", "semver": "^7.3.2", - "semver-diff": "^4.0.0", + "semver-diff": "^5.0.0", "signale": "^1.2.1", "yargs": "^17.5.1" }, @@ -17982,9 +17692,9 @@ } }, "node_modules/semver": { - "version": "7.7.2", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", - "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", + "version": "7.7.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", + "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", "license": "ISC", "bin": { "semver": "bin/semver.js" @@ -17994,9 +17704,10 @@ } }, "node_modules/semver-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/semver-diff/-/semver-diff-4.0.0.tgz", - "integrity": "sha512-0Ju4+6A8iOnpL/Thra7dZsSlOHYAHIeMxfhWQRI1/VLcT3WDBZKKtQt/QkBOsiIN9ZpuvHE6cGZ0x4glCMmfiA==", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/semver-diff/-/semver-diff-5.0.0.tgz", + "integrity": "sha512-0HbGtOm+S7T6NGQ/pxJSJipJvc4DK3FcRVMRkhsIwJDJ4Jcz5DQC1cPPzB5GhzyHjwttW878HaWQq46CkL3cqg==", + "deprecated": "Deprecated as the semver package now supports this built-in.", "dev": true, "license": "MIT", "dependencies": { @@ -18410,40 +18121,27 @@ "node": ">=8" } }, - "node_modules/slash": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/slash/-/slash-5.1.0.tgz", - "integrity": "sha512-ZA6oR3T/pEyuqwMgAKT0/hAv8oAXckzbkmR0UkUosQ+Mc4RxGoJkRmwHgHufaenlyAgE1Mxgpdcrf75y6XcnDg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/slice-ansi": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-5.0.0.tgz", - "integrity": "sha512-FC+lgizVPfie0kkhqUScwRu1O/lF6NOgJmlCgK+/LYxDCTk8sGelYaHDhFcDN+Sn3Cv+3VSa4Byeo+IMCzpMgQ==", + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-7.1.2.tgz", + "integrity": "sha512-iOBWFgUX7caIZiuutICxVgX1SdxwAVFFKwt1EvMYYec/NWO5meOJ6K5uQxhrYBdQJne4KxiqZc+KptFOWFSI9w==", "dev": true, "license": "MIT", "dependencies": { - "ansi-styles": "^6.0.0", - "is-fullwidth-code-point": "^4.0.0" + "ansi-styles": "^6.2.1", + "is-fullwidth-code-point": "^5.0.0" }, "engines": { - "node": ">=12" + "node": ">=18" }, "funding": { "url": "https://github.com/chalk/slice-ansi?sponsor=1" } }, "node_modules/slice-ansi/node_modules/ansi-styles": { - 
"version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", + "integrity": "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg==", "dev": true, "license": "MIT", "engines": { @@ -19031,38 +18729,24 @@ "license": "MIT" }, "node_modules/tailwindcss": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.12.tgz", - "integrity": "sha512-DzFtxOi+7NsFf7DBtI3BJsynR+0Yp6etH+nRPTbpWnS2pZBaSksv/JGctNwSWzbFjp0vxSqknaUylseZqMDGrA==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.16.tgz", + "integrity": "sha512-pONL5awpaQX4LN5eiv7moSiSPd/DLDzKVRJz8Q9PgzmAdd1R4307GQS2ZpfiN7ZmekdQrfhZZiSE5jkLR4WNaA==", "dev": true, "license": "MIT" }, "node_modules/tapable": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.2.2.tgz", - "integrity": "sha512-Re10+NauLTMCudc7T5WLFLAwDhQ0JWdrMK+9B2M8zR5hRExKmsRDCBA7/aV/pNJFltmBFO5BAMlQFi/vq3nKOg==", + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.3.0.tgz", + "integrity": "sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==", "dev": true, "license": "MIT", "engines": { "node": ">=6" - } - }, - "node_modules/tar": { - "version": "7.4.3", - "resolved": "https://registry.npmjs.org/tar/-/tar-7.4.3.tgz", - "integrity": "sha512-5S7Va8hKfV7W5U6g3aYxXmlPoZVAwUMy9AOKyF2fVuZa2UD3qZjg578OrLRt8PcNN1PleVaL/5/yYATNL0ICUw==", - "dev": true, - "license": "ISC", - "dependencies": { - "@isaacs/fs-minipass": "^4.0.0", - "chownr": "^3.0.0", - "minipass": "^7.1.2", - "minizlib": "^3.0.1", - "mkdirp": "^3.0.1", - "yallist": "^5.0.0" }, - "engines": { - "node": ">=18" + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" } }, "node_modules/tar-fs": { @@ -19542,9 +19226,9 @@ } }, "node_modules/turndown": { - "version": "7.2.1", - "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.1.tgz", - "integrity": "sha512-7YiPJw6rLClQL3oUKN3KgMaXeJJ2lAyZItclgKDurqnH61so4k4IH/qwmMva0zpuJc/FhRExBBnk7EbeFANlgQ==", + "version": "7.2.2", + "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.2.tgz", + "integrity": "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==", "license": "MIT", "dependencies": { "@mixmark-io/domino": "^2.2.0" @@ -19656,9 +19340,9 @@ } }, "node_modules/typescript": { - "version": "5.9.2", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.2.tgz", - "integrity": "sha512-CWBzXQrc/qOkhidw1OzBTQuYRbfyxDXJMVJ1XNwUHGROVmuaeiEm3OslpZ1RV96d7SKKjZKrSJu3+t/xlw3R9A==", + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "license": "Apache-2.0", "bin": { "tsc": "bin/tsc", @@ -20600,16 +20284,6 @@ "node": ">=10" } }, - "node_modules/yallist": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", - "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==", - "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=18" - } - 
}, "node_modules/yaml": { "version": "2.8.1", "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.8.1.tgz", @@ -20726,9 +20400,9 @@ } }, "node_modules/zod": { - "version": "4.0.17", - "resolved": "https://registry.npmjs.org/zod/-/zod-4.0.17.tgz", - "integrity": "sha512-1PHjlYRevNxxdy2JZ8JcNAw7rX8V9P1AKkP+x/xZfxB0K5FYfuV+Ug6P/6NVSR2jHQ+FzDDoDHS04nYUsOIyLQ==", + "version": "4.1.12", + "resolved": "https://registry.npmjs.org/zod/-/zod-4.1.12.tgz", + "integrity": "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ==", "license": "MIT", "funding": { "url": "https://github.com/sponsors/colinhacks" diff --git a/package.json b/package.json index 0beb7706..8c3fb58d 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,7 @@ }, "dependencies": { "@fastify/formbody": "^8.0.2", - "@fastify/static": "^8.2.0", + "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", @@ -50,54 +50,54 @@ "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", - "@modelcontextprotocol/sdk": "^1.17.1", - "@trpc/client": "^11.4.4", + "@modelcontextprotocol/sdk": "^1.20.2", + "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", - "axios": "^1.11.0", + "axios": "^1.13.1", "axios-retry": "^4.5.0", - "better-sqlite3": "^12.2.0", + "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", - "dompurify": "^3.2.6", - "dotenv": "^17.2.1", + "dompurify": "^3.3.0", + "dotenv": "^17.2.3", "env-paths": "^3.0.0", - "fastify": "^5.4.0", + "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", - "header-generator": "^2.1.69", + "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", - "jose": "^6.0.12", + "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", - "mime": "^4.0.7", + "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", - "posthog-node": "^5.7.0", + "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", - "semver": "^7.7.2", + "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", - "turndown": "^7.2.0", - "zod": "^4.0.14" + "turndown": "^7.2.2", + "zod": "^4.1.12" }, "devDependencies": { - "@biomejs/biome": "^2.1.3", + "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", - "@semantic-release/github": "^11.0.3", + "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", - "@tailwindcss/postcss": "^4.1.11", - "@tailwindcss/vite": "^4.1.11", + "@tailwindcss/postcss": "^4.1.16", + "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", @@ -105,19 +105,19 @@ "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", - "@types/semver": "^7.7.0", - "@types/turndown": "^5.0.5", + "@types/semver": "^7.7.1", + "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", - "lint-staged": "^16.1.2", - "memfs": "^4.34.0", + "lint-staged": "^16.2.6", + "memfs": "^4.50.0", "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", - "semantic-release": "^24.2.7", + "semantic-release": "^24.2.9", 
"tailwindcss": "^4.1.4", - "typescript": "^5.9.2", + "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", From b82dc274b4de5195f8de73a749291c0be6a8fafd Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Mon, 10 Nov 2025 08:44:38 +0100 Subject: [PATCH 08/20] feat(refresh): improve depth handling and backfill logic in migrations - Update migration to backfill depth based on source_url matching instead of default 0 - Root pages (matching source_url) assigned depth 0, discovered pages depth 1 - Add comprehensive refresh architecture documentation covering conditional requests, status handling, and change detection - Fix test expectations to account for multiple documents processed during refresh - Preserve page depth during refresh operations to maintain crawl hierarchy This ensures depth values accurately reflect page discovery order and provides better context for search relevance while maintaining efficiency through HTTP conditional requests. --- db/migrations/010-add-depth-to-pages.sql | 12 +- docs/refresh-architecture.md | 525 +++++++++++++ docs/refresh-testing-prd.md | 430 +++++++++++ src/cli/commands/refresh.ts | 110 +++ src/cli/index.test.ts | 1 + src/cli/index.ts | 2 + src/mcp/mcpServer.test.ts | 3 + src/mcp/mcpServer.ts | 50 ++ src/mcp/tools.ts | 3 + src/pipeline/PipelineManager.ts | 10 +- src/pipeline/PipelineWorker.test.ts | 8 +- src/pipeline/PipelineWorker.ts | 10 +- src/scraper/ScraperRegistry.ts | 2 - src/scraper/fetcher/HttpFetcher.test.ts | 48 +- src/scraper/fetcher/HttpFetcher.ts | 10 +- src/scraper/strategies/BaseScraperStrategy.ts | 190 ++--- src/scraper/strategies/GitHubRepoProcessor.ts | 179 +++++ .../GitHubRepoScraperStrategy.test.ts | 478 ------------ .../strategies/GitHubRepoScraperStrategy.ts | 529 ------------- .../strategies/GitHubScraperStrategy.test.ts | 442 ++++++++--- .../strategies/GitHubScraperStrategy.ts | 476 +++++++++--- ...aperStrategy.ts => GitHubWikiProcessor.ts} | 90 +-- .../GitHubWikiScraperStrategy.test.ts | 698 ------------------ .../strategies/LocalFileStrategy.test.ts | 64 +- src/scraper/strategies/LocalFileStrategy.ts | 2 - .../strategies/WebScraperStrategy.test.ts | 153 ++-- src/scraper/strategies/WebScraperStrategy.ts | 10 + src/services/workerService.ts | 2 +- src/store/DocumentManagementService.test.ts | 10 +- src/store/DocumentManagementService.ts | 14 +- src/store/DocumentStore.test.ts | 40 +- src/store/DocumentStore.ts | 107 +-- src/store/types.ts | 4 +- src/tools/index.ts | 1 + 34 files changed, 2484 insertions(+), 2229 deletions(-) create mode 100644 docs/refresh-architecture.md create mode 100644 docs/refresh-testing-prd.md create mode 100644 src/cli/commands/refresh.ts create mode 100644 src/scraper/strategies/GitHubRepoProcessor.ts delete mode 100644 src/scraper/strategies/GitHubRepoScraperStrategy.test.ts delete mode 100644 src/scraper/strategies/GitHubRepoScraperStrategy.ts rename src/scraper/strategies/{GitHubWikiScraperStrategy.ts => GitHubWikiProcessor.ts} (68%) delete mode 100644 src/scraper/strategies/GitHubWikiScraperStrategy.test.ts diff --git a/db/migrations/010-add-depth-to-pages.sql b/db/migrations/010-add-depth-to-pages.sql index 0c69ec74..86a13e43 100644 --- a/db/migrations/010-add-depth-to-pages.sql +++ b/db/migrations/010-add-depth-to-pages.sql @@ -5,6 +5,12 @@ -- Add depth column to pages table ALTER TABLE pages ADD COLUMN depth INTEGER; --- Backfill existing pages with depth 0 (conservative default) --- This ensures all existing pages have a valid depth value -UPDATE pages SET 
depth = 0 WHERE depth IS NULL;
+-- Backfill depth based on stored scraper options
+-- Depth 0: Pages whose URL exactly matches the source_url in scraper_options
+-- Depth 1: All other pages (discovered during crawl)
+UPDATE pages SET depth = CASE
+  WHEN url = (SELECT source_url FROM versions WHERE versions.id = pages.version_id)
+  THEN 0
+  ELSE 1
+END
+WHERE depth IS NULL;
diff --git a/docs/refresh-architecture.md b/docs/refresh-architecture.md
new file mode 100644
index 00000000..d8598c6d
--- /dev/null
+++ b/docs/refresh-architecture.md
@@ -0,0 +1,525 @@
+# Refresh Architecture
+
+## Overview
+
+The refresh system enables efficient re-indexing of previously scraped documentation by leveraging **HTTP conditional requests** and **intelligent change detection**. Instead of re-downloading and re-processing all content, refresh operations check each page for modifications and only process what has changed.
+
+**Key efficiency gains:**
+
+- 70-90% reduction in bandwidth usage for typical documentation updates
+- Proportional reduction in processing time (unchanged pages skip pipeline entirely)
+- Automatic detection and removal of deleted pages
+- Discovery and indexing of newly added pages
+
+The refresh system integrates seamlessly with the existing scraping pipeline, using the same strategies, fetchers, and processors as initial indexing operations.
+
+---
+
+## Core Mechanism: Conditional Requests
+
+Refresh operations rely on **ETags** (entity tags) - unique identifiers assigned by web servers to specific versions of a resource. When content changes, the ETag changes.
+
+### How It Works
+
+**Initial Scraping:**
+
+1. Fetch page from server
+2. Extract content and links
+3. Store content in database **with ETag**
+4. Continue to discovered links
+
+**Refresh Operation:**
+
+1. Load existing pages from database (URL + ETag + pageId)
+2. Fetch page with `If-None-Match: <etag>` header
+3. Server compares ETags and responds:
+   - **304 Not Modified** → Content unchanged, skip processing
+   - **200 OK** → Content changed, re-process through pipeline
+   - **404 Not Found** → Page deleted, remove from index
+
+This approach shifts the burden of change detection to the HTTP layer, where it's handled efficiently by web servers and CDNs.
+
+---
+
+## Status Handling
+
+The system handles three HTTP response statuses during refresh:
+
+| Status Code          | Meaning                             | Database Action                       | Pipeline Action                 |
+| -------------------- | ----------------------------------- | ------------------------------------- | ------------------------------- |
+| **304 Not Modified** | Content unchanged since last scrape | No changes (preserves existing data)  | Skip pipeline, no re-processing |
+| **200 OK**           | Content modified or new page        | Delete old chunks, insert new content | Full pipeline processing        |
+| **404 Not Found**    | Page no longer exists               | Delete all documents for this page    | Skip pipeline                   |
+
+### 304 Not Modified Flow
+
+When a page returns 304, the system:
+
+1. Recognizes the page was checked successfully
+2. Preserves all existing content in database (no updates)
+3. Skips chunking, embedding, and indexing entirely
+4. Continues to next page in queue
+
+This is the **fast path** that makes refresh efficient.
+
+### 200 OK Flow
+
+When a page returns 200 with new content, the system:
+
+1. Deletes existing document chunks for this page (by pageId)
+2. Re-processes through full pipeline (HTML→Markdown, chunking, embeddings)
+3. Inserts new chunks with updated embeddings
+4.
Updates page metadata (ETag, last_modified, title, etc.) +5. Extracts and follows new links + +This ensures modified content is always current. + +### 404 Not Found Flow + +When a page returns 404, the system: + +1. Deletes the page record AND all associated document chunks (by pageId) +2. Reports deletion via progress callback with `deleted: true` flag +3. Does not follow any links from deleted pages + +**Note:** The `deletePage()` method performs a complete deletion of the page and all its document chunks. This is a hard delete operation that immediately removes the page from search results. The CASCADE DELETE constraint in the database schema ensures all related documents are automatically removed when a page is deleted. + +--- + +## Database Schema + +### Pages Table + +The `pages` table stores page-level metadata with the following key fields: + +- **`id`**: Primary key for the page +- **`version_id`**: Foreign key to the versions table +- **`url`**: The page's URL (unique per version) +- **`title`**: Page title extracted from content +- **`etag`**: HTTP ETag header for change detection +- **`last_modified`**: HTTP Last-Modified header +- **`content_type`**: MIME type of the content +- **`depth`**: Crawl depth at which the page was discovered +- **`created_at`**: Timestamp when page was first indexed +- **`updated_at`**: Timestamp of last update (automatically maintained by triggers) + +The combination of `(version_id, url)` is unique, ensuring one page record per URL per version. + +### Documents Table + +The `documents` table stores individual content chunks: + +- **`id`**: Primary key for the chunk +- **`page_id`**: Foreign key to the pages table +- **`content`**: The text content of this chunk +- **`metadata`**: JSON containing chunk-specific metadata (level, path, types) +- **`sort_order`**: Order of this chunk within the page +- **`embedding`**: Vector embedding for similarity search +- **`created_at`**: Timestamp when chunk was created + +Multiple document chunks link to a single page via `page_id`. + +--- + +## Refresh Workflow + +```mermaid +graph TD + A[Start Refresh] --> B[Load Existing Pages from DB] + B --> C[Create initialQueue with pageId + ETag + depth] + C --> D{Root URL in DB?} + D -->|No| E[Add Root URL at depth 0] + D -->|Yes| F[Root URL already in queue] + E --> G[Begin Scraping] + F --> G + + G --> H[Process Queue Item] + H --> I[Fetch with ETag] + + I --> J{HTTP Status} + J -->|304| K[Skip Processing] + J -->|200| L[Delete Old Chunks] + J -->|404| M[Delete Page & Chunks] + + K --> N[Continue to Next] + L --> O[Full Pipeline Processing] + M --> P[Report Deletion] + + O --> Q[Insert New Chunks] + Q --> R[Update Page Metadata] + R --> S[Extract Links] + + N --> T{More in Queue?} + P --> T + S --> U[Add New Links to Queue] + U --> T + T -->|Yes| H + T -->|No| V[Complete] +``` + +--- + +## Full Re-Crawl Behavior + +Despite using conditional requests, refresh operations perform a **full re-crawl** of the documentation structure. This design choice is intentional and critical for correctness. + +### Why Full Re-Crawl? + +**Link structure can change without content changing:** + +- Page A (unchanged, 304) might add a link to new Page B +- Page C might remove a link, making Page D unreachable +- Navigation menus can be updated without content changes + +**If we only followed stored pages:** + +- Newly added pages would never be discovered +- Reorganizations would break coverage +- Deleted pages might remain in index indefinitely + +### How It Works + +1. 
**Start from root URL** (depth 0) with ETag check +2. **Even if root returns 304**, extract its links and follow them +3. **Discover new pages** not in the database (no ETag, no pageId) +4. **Process discovered pages** through full pipeline +5. **Delete chunks for 404 pages** to remove from search + +This approach combines the efficiency of conditional requests (skip unchanged pages) with the completeness of full crawling (find new pages). + +--- + +## Link Discovery and Depth Preservation + +### Initial Queue Setup + +Refresh operations receive an `initialQueue` parameter containing all previously indexed pages: + +```typescript +initialQueue: [ + { url: "https://docs.example.com", depth: 0, pageId: 1, etag: "abc123" }, + { + url: "https://docs.example.com/guide", + depth: 1, + pageId: 2, + etag: "def456", + }, + { url: "https://docs.example.com/api", depth: 1, pageId: 3, etag: "ghi789" }, + // ... all other indexed pages +]; +``` + +The **depth value is preserved** from the original scrape. This ensures: + +- Pages respect `maxDepth` limits during refresh +- Depth-based filtering works consistently +- Progress tracking shows accurate depth information + +### New Page Discovery + +When refresh discovers a new page (not in `initialQueue`): + +1. Calculate depth based on parent page: `parent.depth + 1` +2. Assign no `pageId` (will be created during database insert) +3. Process through full pipeline as a new page + +### Root URL Handling + +The root URL is **always processed**, even if it appears in `initialQueue`: + +1. Ensures the entry point is always checked +2. Allows detection of top-level navigation changes +3. Serves as the canonical base for link resolution + +The `BaseScraperStrategy` ensures the root URL appears exactly once in the queue, either from `initialQueue` or added explicitly. 
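+
+To make these queue rules concrete, the sketch below shows how a refresh queue could be seeded and how discovered links inherit depth. It is illustrative only: `QueueItem`, `seedQueue`, and `enqueueDiscoveredLink` are simplified names, not the actual `BaseScraperStrategy` API.
+
+```typescript
+// Illustrative sketch; names are simplified and do not mirror the real
+// BaseScraperStrategy implementation.
+interface QueueItem {
+  url: string;
+  depth: number;
+  pageId?: number; // present only for pages already in the database
+  etag?: string; // enables conditional requests for known pages
+}
+
+/** Seed the refresh queue: all known pages plus the root URL exactly once. */
+function seedQueue(rootUrl: string, initialQueue: QueueItem[]): QueueItem[] {
+  const queue = [...initialQueue];
+  if (!queue.some((item) => item.url === rootUrl)) {
+    queue.unshift({ url: rootUrl, depth: 0 });
+  }
+  return queue;
+}
+
+/** Enqueue a newly discovered link, inheriting depth from its parent page. */
+function enqueueDiscoveredLink(
+  queue: QueueItem[],
+  visited: Set<string>,
+  parent: QueueItem,
+  link: string,
+  maxDepth: number,
+): void {
+  const depth = parent.depth + 1;
+  if (depth > maxDepth || visited.has(link)) return;
+  visited.add(link);
+  queue.push({ url: link, depth }); // no pageId/etag: treated as a new page
+}
+```
+
+Because known pages keep their stored depth, `maxDepth` filtering behaves the same during refresh as during the original crawl.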
+ +--- + +## Strategy-Specific Behavior + +Different scraping strategies handle refresh operations differently based on their data sources: + +### WebScraperStrategy + +**ETag Source:** HTTP `ETag` header from web servers + +**Refresh Characteristics:** + +- Most efficient with modern web servers and CDNs +- Supports conditional requests natively +- Handles redirects by updating canonical URLs +- Discovers new pages through link following + +**Example Scenario:** + +``` +Initial: https://docs.example.com/v1.0/guide +After Redirect: https://docs.example.com/v2.0/guide +Action: Update canonical URL, check ETag, process if changed +``` + +### LocalFileStrategy + +**ETag Source:** File modification time (mtime) converted to ISO string + +**Refresh Characteristics:** + +- Uses filesystem metadata instead of HTTP +- Detects file modifications via mtime comparison +- Discovers new files by scanning directories +- Handles file deletions through missing file detection (ENOENT) + +**Trade-offs:** + +- mtime less granular than HTTP ETags +- Directory structures must be re-scanned fully +- No network overhead (local filesystem) + +### GitHubScraperStrategy + +**ETag Source:** Varies by content type + +**Refresh Characteristics:** + +- Wiki pages: HTTP ETags from GitHub's web interface +- Repository files: GitHub API ETags for raw content +- Mixed approach: Wiki content via web, files via raw.githubusercontent.com + +**Complex Scenarios:** + +- Root URL discovery returns both wiki URL and file URLs +- Wiki refresh follows standard web strategy +- File refresh checks individual file ETags from raw.githubusercontent.com + +**Example Flow:** + +``` +Root: https://github.com/user/repo + ↓ +Discovers: https://github.com/user/repo/wiki (returns 304 or 200) +Discovers: File URLs as HTTPS blob URLs (e.g., /blob/main/README.md) +``` + +--- + +## Database Operations + +### Update Patterns + +Refresh operations perform different database operations based on status: + +**304 Not Modified:** + +- No database changes - content and metadata remain unchanged +- Strategy simply continues to next page in queue + +**200 OK (Modified Content):** + +1. Delete old document chunks for the page +2. Update page metadata via UPSERT (title, etag, last_modified, content_type, depth) +3. Insert new document chunks +4. Update vector embeddings for new chunks + +**404 Not Found:** + +1. Delete all document chunks for the page +2. Delete the page record itself + +**New Page (200 OK, no pageId):** + +1. Insert new page record +2. Insert document chunks +3. Generate and store vector embeddings + +### Concurrency Handling + +The refresh system processes multiple pages concurrently (default: 3 workers). Database operations are: + +- **Atomic** - Each page update is a single transaction in PipelineWorker +- **Isolated** - No cross-page dependencies +- **Idempotent** - Delete + Insert pattern is safe to retry on failure + +The `visited` set in `BaseScraperStrategy` prevents duplicate processing across concurrent workers. 
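+
+The update patterns above reduce to a small dispatch on the fetch status. The sketch below condenses them; the store interface is trimmed to the two methods named in this document, and the real `PipelineWorker` and `DocumentStore` signatures may differ.
+
+```typescript
+// Condensed sketch of the per-status handling; not the actual implementation.
+type FetchStatus = "SUCCESS" | "NOT_MODIFIED" | "NOT_FOUND";
+
+interface StoreLike {
+  deletePage(pageId: number): Promise<boolean>; // removes page + chunks (CASCADE)
+  removeDocumentsByPageId(pageId: number): Promise<number>; // removes chunks only
+}
+
+interface PageResult {
+  status: FetchStatus;
+  pageId?: number; // undefined for pages discovered during this refresh
+}
+
+async function applyRefreshResult(store: StoreLike, result: PageResult): Promise<void> {
+  switch (result.status) {
+    case "NOT_MODIFIED":
+      // 304: nothing to do; existing chunks and metadata stay untouched.
+      return;
+    case "NOT_FOUND":
+      // 404: hard-delete the page; CASCADE removes its document chunks.
+      if (result.pageId !== undefined) await store.deletePage(result.pageId);
+      return;
+    case "SUCCESS":
+      // 200: drop stale chunks, then re-run the pipeline and upsert page
+      // metadata (title, etag, last_modified, depth) with the new chunks.
+      if (result.pageId !== undefined) {
+        await store.removeDocumentsByPageId(result.pageId);
+      }
+      // ...chunking, embedding, and insertion happen here...
+      return;
+  }
+}
+```
+
+Treating each page as an independent delete-then-insert unit is what keeps the operation idempotent and safe to retry after a partial failure.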
+ +--- + +## Performance Characteristics + +### Bandwidth Savings + +**Typical documentation site refresh:** + +- 70-90% of pages unchanged (return 304) +- 5-10% of pages modified (return 200) +- 1-5% of pages deleted (return 404) +- <5% of pages newly added + +**Bandwidth reduction:** + +- 304 responses: ~1KB (headers only) +- 200 responses: Full page size +- Net reduction: 70-90% compared to full re-indexing + +### Processing Time + +**Time spent per page:** + +- 304: <50ms (HTTP request + no database changes) +- 200: 500-2000ms (fetch + pipeline + chunking + embeddings) +- 404: <100ms (HTTP request + document deletion) + +**Overall speedup:** + +- Sites with few changes: 5-10x faster than re-indexing +- Sites with many changes: Approaches re-indexing time +- Sweet spot: Weekly/monthly refresh of active documentation + +### Network Efficiency + +**Request patterns:** + +- Single HTTP request per page (no redundant fetches) +- Conditional requests leverage CDN caching +- Failed requests don't retry (404 is definitive) +- Concurrent requests respect `maxConcurrency` limit + +--- + +## Design Trade-offs + +### Full Re-Crawl vs. Stored-Only Check + +**Decision:** Always re-crawl from root, even during refresh + +**Trade-off:** + +- ✅ Discovers new pages automatically +- ✅ Detects navigation changes +- ✅ Removes orphaned pages +- ❌ Requires checking all known pages (even if 304) +- ❌ Network requests for unchanged pages + +**Rationale:** Correctness over performance. The conditional request mechanism mitigates the performance cost while ensuring complete coverage. + +### Hard Deletion vs. Soft Deletion + +**Decision:** Hard delete both document chunks and page records + +**Trade-off:** + +- ✅ Deleted content immediately removed from search +- ✅ Page records completely removed, preventing database bloat +- ✅ Simple implementation (no query filtering needed) +- ✅ Clean database state with no orphaned page records +- ❌ Document chunks and page metadata cannot be recovered +- ❌ No historical tracking of deleted pages + +**Rationale:** Search accuracy is paramount. Deleted content must not appear in results. Complete deletion ensures database remains clean and doesn't accumulate empty page records over time. The page metadata loss is acceptable since deleted pages are no longer relevant to the documentation. + +### ETag Storage per Page + +**Decision:** Store ETags in pages table, not separate cache + +**Trade-off:** + +- ✅ Simple schema, no joins required +- ✅ Atomic updates (page + ETag together) +- ✅ ETag tied to content version +- ❌ Larger pages table +- ❌ ETag duplication if same content on multiple URLs + +**Rationale:** Simplicity and correctness. ETags are intrinsically tied to content versions, not URLs. 
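+
+For reference, the conditional request that all of these trade-offs build on fits in a few lines. The sketch below uses axios directly and is illustrative only; a production fetcher additionally needs retries, redirect handling, and realistic request headers, which are omitted here.
+
+```typescript
+import axios from "axios";
+
+// Minimal conditional-request sketch; not the project's HttpFetcher.
+type ConditionalResult =
+  | { status: "NOT_MODIFIED"; etag?: string }
+  | { status: "NOT_FOUND" }
+  | { status: "SUCCESS"; etag?: string; content: string };
+
+async function fetchWithEtag(url: string, etag?: string): Promise<ConditionalResult> {
+  const response = await axios.get<string>(url, {
+    headers: etag ? { "If-None-Match": etag } : {},
+    responseType: "text",
+    // Treat 304 and 404 as expected outcomes rather than thrown errors.
+    validateStatus: (status) =>
+      (status >= 200 && status < 300) || status === 304 || status === 404,
+  });
+
+  if (response.status === 304) return { status: "NOT_MODIFIED", etag };
+  if (response.status === 404) return { status: "NOT_FOUND" };
+  return {
+    status: "SUCCESS",
+    etag: response.headers["etag"] as string | undefined,
+    content: response.data,
+  };
+}
+```
+
+Storing the returned `etag` alongside the page is what allows the next refresh to send it back and receive a cheap 304 when nothing has changed.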
+ +--- + +## Testing Strategy + +Refresh behavior is tested at multiple levels: + +### Unit Tests (Strategy Level) + +Each strategy's test suite includes refresh scenarios: + +- Pages returning 304 (skip processing) +- Pages returning 200 (re-process) +- Pages returning 404 (mark deleted) +- New pages discovered during refresh +- Depth preservation from initialQueue + +**Example:** `LocalFileStrategy.test.ts` refresh workflow tests + +### Integration Tests (Pipeline Level) + +End-to-end refresh workflows: + +- Multi-page refresh with mixed statuses +- Concurrent refresh operations +- Database consistency after refresh +- Link discovery and depth handling + +**Example:** `test/refresh-pipeline-e2e.test.ts` + +### Real-World Scenarios + +Testing against actual documentation sites: + +- GitHub repositories with wiki + files +- NPM package documentation +- Local file hierarchies with modifications + +These tests validate that the refresh system handles real content structures correctly. + +--- + +## Future Enhancements + +Potential improvements to the refresh system: + +### Incremental Refresh + +Only check pages modified since last refresh based on timestamps: + +- Reduces network requests further +- Risk: Miss changes on infrequently checked pages +- Requires careful timestamp management + +### Parallel Strategy Execution + +Run multiple strategies simultaneously for multi-source documentation: + +- Example: GitHub repo files + NPM registry + official docs +- Requires coordination across strategies +- Complex dependency management + +### Smart Re-crawl Scheduling + +Adjust refresh frequency based on historical change patterns: + +- Stable pages: Check less frequently +- Volatile pages: Check more frequently +- Requires tracking change history per page + +### Webhook-Based Updates + +Trigger refresh on content update notifications: + +- GitHub webhooks for repository changes +- CMS webhooks for documentation updates +- Eliminates polling, reduces latency +- Requires webhook infrastructure + +--- + +## Summary + +The refresh architecture achieves **efficient re-indexing** through: + +1. **Conditional HTTP requests** - Let servers decide what changed +2. **Full re-crawl** - Ensure complete coverage despite conditional requests +3. **Status-based handling** - Different actions for 304/200/404 +4. **Depth preservation** - Maintain original discovery structure +5. **Unified pipeline** - Same code paths as initial scraping + +This design balances **performance** (skip unchanged content) with **correctness** (discover all changes) while maintaining **simplicity** (reuse existing infrastructure). + +Refresh is not a separate system - it's the same scraping pipeline with smarter change detection. diff --git a/docs/refresh-testing-prd.md b/docs/refresh-testing-prd.md new file mode 100644 index 00000000..dd52ac3b --- /dev/null +++ b/docs/refresh-testing-prd.md @@ -0,0 +1,430 @@ +# Refresh Testing PRD + +## Overview + +This document outlines additional test cases needed to ensure comprehensive coverage of the refresh functionality. The focus is on unit tests for specific components and edge cases not covered by existing E2E tests. + +## Existing Coverage + +The current `test/refresh-pipeline-e2e.test.ts` covers: + +- ✅ Page deletion (404 during refresh) +- ✅ Page updates (200 with new content) +- ✅ Unchanged pages (304 responses) +- ✅ New page discovery during refresh +- ✅ 404 handling during normal scraping + +## Proposed Additional Test Coverage + +### 1. 
BaseScraperStrategy Unit Tests + +**File:** `src/scraper/strategies/BaseScraperStrategy.test.ts` (extend existing) + +#### 1.1 Initial Queue Processing + +```typescript +describe("initialQueue handling", () => { + it("should process all items from initialQueue before discovering new links"); + it("should preserve depth from initialQueue items"); + it("should preserve pageId from initialQueue items"); + it("should preserve etag from initialQueue items"); + it("should deduplicate between initialQueue and root URL"); + it("should handle empty initialQueue gracefully"); +}); +``` + +**Rationale:** The initialQueue is critical for refresh operations but isn't thoroughly tested at the unit level. We need to verify it's properly integrated into the scraping workflow. + +#### 1.2 Refresh Mode Detection + +```typescript +describe("refresh mode detection", () => { + it("should detect refresh mode when initialQueue is provided"); + it("should use normal mode when initialQueue is empty"); + it("should correctly calculate effectiveTotal with initialQueue"); + it("should correctly track totalDiscovered with initialQueue"); +}); +``` + +**Rationale:** The strategy behaves differently in refresh mode. We should verify this detection logic works correctly. + +#### 1.3 Root URL Handling in Refresh + +```typescript +describe("root URL handling during refresh", () => { + it("should process root URL even if it appears in initialQueue"); + it("should not duplicate root URL if already in initialQueue"); + it("should use etag from initialQueue for root URL if available"); + it("should add root URL at depth 0 if not in initialQueue"); +}); +``` + +**Rationale:** Root URL handling has special logic that needs validation to ensure it's always processed exactly once. + +### 2. ProcessItem Result Status Handling + +**File:** `src/scraper/strategies/BaseScraperStrategy.test.ts` (extend existing) + +#### 2.1 Status-Based Counting + +```typescript +describe("page counting with different statuses", () => { + it("should count pages that return 200 OK"); + it("should count pages that return 304 Not Modified"); + it("should count pages that return 404 Not Found"); + it("should NOT count directory discoveries (no content, no pageId)"); + it("should increment pageCount correctly with mixed statuses"); +}); +``` + +**Rationale:** The `shouldCount` logic in `processBatch` is critical for correct progress reporting and needs explicit testing. + +#### 2.2 Progress Callback with Statuses + +```typescript +describe("progress callback with different statuses", () => { + it("should call progressCallback with result=null for 304 responses"); + it("should call progressCallback with result=null for 404 responses"); + it("should call progressCallback with deleted=true for 404 responses"); + it("should call progressCallback with full result for 200 responses"); + it("should include pageId in progress for refresh operations"); +}); +``` + +**Rationale:** Progress callbacks are how external systems track refresh progress. We need to verify they receive correct information for each status. + +### 3. 
ETag Handling Unit Tests + +**File:** `src/scraper/fetcher/HttpFetcher.test.ts` (extend existing) + +#### 3.1 Conditional Request Headers + +```typescript +describe("conditional request headers", () => { + it("should send If-None-Match header when etag is provided"); + it("should NOT send If-None-Match header when etag is null"); + it("should NOT send If-None-Match header when etag is undefined"); + it("should handle etag with quotes correctly"); + it("should handle etag without quotes correctly"); +}); +``` + +**Rationale:** ETag header formatting is critical for conditional requests. We need to verify it follows HTTP standards. + +#### 3.2 ETag in Response + +```typescript +describe("ETag extraction from responses", () => { + it("should extract ETag from 200 responses"); + it("should preserve ETag from 304 responses"); + it("should handle missing ETag header gracefully"); + it("should handle weak ETags (W/) correctly"); + it("should normalize ETag quotes consistently"); +}); +``` + +**Rationale:** ETag extraction must be consistent to enable proper change detection in future refreshes. + +### 4. FileFetcher ETag Tests + +**File:** `src/scraper/fetcher/FileFetcher.test.ts` (new file) + +#### 4.1 Mtime-Based ETag Generation + +```typescript +describe("mtime-based ETag generation", () => { + it("should generate ETag from file mtime"); + it("should return same ETag for unchanged files"); + it("should return different ETag when file is modified"); + it("should handle files without mtime gracefully"); + it("should generate consistent ETag format (ISO string)"); +}); +``` + +**Rationale:** FileFetcher uses mtime as ETag equivalent. This needs explicit testing to ensure it works correctly. + +#### 4.2 File Status Detection + +```typescript +describe("file status detection", () => { + it("should return SUCCESS when file exists"); + it("should return NOT_FOUND when file does not exist"); + it("should return NOT_MODIFIED when mtime matches etag"); + it("should return SUCCESS when mtime differs from etag"); + it("should handle permission errors appropriately"); +}); +``` + +**Rationale:** File status detection drives refresh logic for local files and needs thorough testing. + +### 5. PipelineWorker Refresh Logic + +**File:** `src/pipeline/PipelineWorker.test.ts` (extend existing) + +#### 5.1 Status-Based Database Operations + +```typescript +describe("database operations based on fetch status", () => { + it("should skip database operations for 304 Not Modified"); + it("should delete and re-insert for 200 OK with pageId"); + it("should insert new page for 200 OK without pageId"); + it("should call deletePage for 404 Not Found"); + it("should not process content for 404 Not Found"); +}); +``` + +**Rationale:** PipelineWorker orchestrates database operations based on status. This critical logic needs unit tests. + +#### 5.2 PageId Handling + +```typescript +describe("pageId handling during refresh", () => { + it("should use pageId from scrape result when available"); + it("should handle missing pageId for new pages"); + it("should pass pageId to removeDocumentsByPageId"); + it("should pass pageId to deletePage"); + it("should preserve pageId in progress events"); +}); +``` + +**Rationale:** PageId is the key identifier for refresh operations. We need to verify it's handled correctly throughout the pipeline. + +### 6. 
+
+### 6. DocumentStore Deletion Methods
+
+**File:** `src/store/DocumentStore.test.ts` (extend existing)
+
+#### 6.1 deletePage Method
+
+```typescript
+describe("deletePage method", () => {
+  it("should delete page and all associated documents via CASCADE");
+  it("should return true when page exists and is deleted");
+  it("should return false when page does not exist");
+  it("should handle concurrent deletions gracefully");
+  it("should not affect other pages in same version");
+});
+```
+
+**Rationale:** The new deletePage method is critical for proper 404 handling. It needs comprehensive unit tests.
+
+#### 6.2 removeDocumentsByPageId Method
+
+```typescript
+describe("removeDocumentsByPageId method", () => {
+  it("should remove all documents for given pageId");
+  it("should return count of documents removed");
+  it("should not affect documents from other pages");
+  it("should handle non-existent pageId gracefully");
+  it("should handle empty document set gracefully");
+});
+```
+
+**Rationale:** This method is used during content updates (200 OK). We need to verify it works correctly.
+
+### 7. Strategy-Specific Refresh Tests
+
+**File:** `src/scraper/strategies/WebScraperStrategy.test.ts` (extend existing)
+
+#### 7.1 ETag Propagation
+
+```typescript
+describe("ETag propagation in WebScraperStrategy", () => {
+  it("should pass etag from QueueItem to fetcher");
+  it("should preserve etag in ProcessItemResult");
+  it("should update etag when content changes (200 OK)");
+  it("should preserve etag when content unchanged (304)");
+  it("should clear etag for deleted pages (404)");
+});
+```
+
+**Rationale:** We need to verify ETags flow correctly through the web scraping pipeline.
+
+#### 7.2 Refresh with Redirects
+
+```typescript
+describe("refresh with URL redirects", () => {
+  it("should update canonical URL after redirect");
+  it("should use new ETag after redirect");
+  it("should handle redirect to same domain");
+  it("should handle redirect during refresh operation");
+});
+```
+
+**Rationale:** Redirects during refresh can complicate URL tracking. This needs explicit testing.
+
+### 8. LocalFileStrategy Refresh Tests
+
+**File:** `src/scraper/strategies/LocalFileStrategy.test.ts` (extend existing)
+
+#### 8.1 File Modification Detection
+
+```typescript
+describe("file modification detection", () => {
+  it("should detect when file mtime has changed");
+  it("should skip processing when mtime unchanged");
+  it("should handle file deletion during refresh");
+  it("should discover new files during refresh");
+});
+```
+
+**Rationale:** The existing refresh tests in LocalFileStrategy are good but can be expanded with more specific mtime scenarios.
+
+#### 8.2 Directory Re-scanning
+
+```typescript
+describe("directory re-scanning during refresh", () => {
+  it("should discover files added to directory");
+  it("should detect files removed from directory");
+  it("should handle nested directory changes");
+  it("should preserve depth for existing files");
+});
+```
+
+**Rationale:** Directory refresh requires full re-scan. We need to verify this works correctly.
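+
+A minimal sketch for 6.1, assuming the in-memory `DocumentStore` fixture that the existing tests construct in `beforeEach`; `seedPage` is a hypothetical helper standing in for however those fixtures insert a page with documents:
+
+```typescript
+describe("deletePage method", () => {
+  it("should return false when page does not exist", async () => {
+    await expect(store.deletePage(999_999)).resolves.toBe(false);
+  });
+
+  it("should return true when page exists and is deleted", async () => {
+    // seedPage is hypothetical: insert a page (plus documents) for a version
+    // and return its id, using whatever helpers the existing tests provide.
+    const pageId = await seedPage(store, "lib", "1.0.0", "https://example.com/a");
+    await expect(store.deletePage(pageId)).resolves.toBe(true);
+    // Deleting again confirms the row (and its documents, via CASCADE) is gone.
+    await expect(store.deletePage(pageId)).resolves.toBe(false);
+  });
+});
+```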
+
+### 9. GitHubScraperStrategy Refresh Tests
+
+**File:** `src/scraper/strategies/GitHubScraperStrategy.test.ts` (extend existing)
+
+#### 9.1 Mixed Content Refresh
+
+```typescript
+describe("mixed wiki and file refresh", () => {
+  it("should refresh wiki pages with HTTP ETags");
+  it("should refresh repository files with API ETags");
+  it("should handle wiki deletion gracefully");
+  it("should discover new files added to repository");
+  it("should handle tree API rate limiting");
+});
+```
+
+**Rationale:** GitHub strategy handles both wiki and files. Refresh logic for both needs validation.
+
+### 10. Edge Cases and Error Scenarios
+
+**File:** `test/refresh-edge-cases-e2e.test.ts` (new file)
+
+#### 10.1 Network Failures During Refresh
+
+```typescript
+describe("network failures during refresh", () => {
+  it("should handle timeout for single page gracefully");
+  it("should continue refresh after network error");
+  it("should mark job as failed after multiple errors");
+  it("should preserve valid pages after partial failure");
+});
+```
+
+**Rationale:** Network issues are common in production. We need to verify graceful degradation.
+
+#### 10.2 Database Failures During Refresh
+
+```typescript
+describe("database failures during refresh", () => {
+  it("should rollback transaction on deletion failure");
+  it("should handle constraint violations gracefully");
+  it("should recover from temporary lock contention");
+  it("should preserve database consistency on error");
+});
+```
+
+**Rationale:** Database operations can fail. We need to verify error handling maintains consistency.
+
+#### 10.3 Concurrent Refresh Operations
+
+```typescript
+describe("concurrent refresh operations", () => {
+  it("should handle concurrent refreshes of same version");
+  it("should handle concurrent refreshes of different versions");
+  it("should prevent duplicate processing of same URL");
+  it("should maintain database consistency with concurrent writes");
+});
+```
+
+**Rationale:** Production systems may trigger multiple refreshes. We need to verify concurrent safety.
+
+#### 10.4 Malformed ETag Handling
+
+```typescript
+describe("malformed ETag handling", () => {
+  it("should handle ETag with special characters");
+  it("should handle very long ETags");
+  it("should handle empty ETag string");
+  it("should handle ETag with invalid quotes");
+  it("should fall back gracefully with malformed ETags");
+});
+```
+
+**Rationale:** Real-world servers may return non-standard ETags. We need robust handling.
+
+## Implementation Priority
+
+### Phase 1: Critical Unit Tests (High Priority)
+
+1. **BaseScraperStrategy initialQueue handling** - Core refresh functionality
+2. **PipelineWorker status-based operations** - Database consistency
+3. **DocumentStore deletePage** - New method validation
+4. **HttpFetcher conditional headers** - ETag correctness
+
+### Phase 2: Strategy-Specific Tests (Medium Priority)
+
+5. **WebScraperStrategy ETag propagation** - Most common use case
+6. **LocalFileStrategy file modification** - File-based refresh
+7. **FileFetcher status detection** - File-based change detection
+
+### Phase 3: Edge Cases (Lower Priority)
+
+8. **Network failures** - Production resilience
+9. **Concurrent operations** - Scale testing
+10. 
**Malformed data handling** - Robustness + +## Testing Approach + +### Unit Tests + +- **Isolation**: Mock external dependencies (filesystem, network, database) +- **Speed**: Should run in <100ms per test +- **Clarity**: Each test validates one specific behavior +- **Coverage**: Aim for >90% line coverage of refresh code paths + +### Integration Tests + +- **Realistic**: Use in-memory database but real HTTP mocking +- **Comprehensive**: Test full workflows end-to-end +- **Performance**: Should complete in <5 seconds per test +- **Scenarios**: Cover common real-world refresh patterns + +### E2E Tests + +- **Complete**: Use full stack including pipeline workers +- **Realistic**: Mock external services (GitHub API, web servers) +- **Validation**: Verify database state after operations +- **Time**: May take 10-30 seconds per test + +## Success Criteria + +1. **Code Coverage**: >90% line coverage for refresh-related code +2. **Test Speed**: Unit tests complete in <5 seconds total +3. **Reliability**: All tests pass consistently (no flakiness) +4. **Documentation**: Each test has clear description of what it validates +5. **Maintainability**: Tests use helpers/fixtures to reduce duplication + +## Non-Goals + +- **Performance benchmarking**: Not testing refresh speed, only correctness +- **Load testing**: Not testing high-volume refresh scenarios +- **Integration with real services**: All external services should be mocked +- **UI testing**: Refresh is a backend feature with no UI + +## Open Questions + +1. Should we test ETag generation algorithms directly, or only their behavior? +2. How do we test CASCADE DELETE without actually running migrations in tests? +3. Should we add property-based tests for ETag normalization? +4. Do we need tests for refresh cancellation mid-operation? + +## References + +- Existing E2E tests: `test/refresh-pipeline-e2e.test.ts` +- Refresh architecture: `docs/refresh-architecture.md` +- Strategy unit tests: `src/scraper/strategies/*.test.ts` diff --git a/src/cli/commands/refresh.ts b/src/cli/commands/refresh.ts new file mode 100644 index 00000000..e8ad34a4 --- /dev/null +++ b/src/cli/commands/refresh.ts @@ -0,0 +1,110 @@ +/** + * Refresh command - Re-scrapes an existing library version using ETags to skip unchanged pages. 
+ */ + +import type { Command } from "commander"; +import { Option } from "commander"; +import type { PipelineOptions } from "../../pipeline"; +import type { IPipeline } from "../../pipeline/trpc/interfaces"; +import { createDocumentManagement } from "../../store"; +import type { IDocumentManagement } from "../../store/trpc/interfaces"; +import { analytics, TelemetryEvent } from "../../telemetry"; +import { RefreshVersionTool } from "../../tools/RefreshVersionTool"; +import { + createPipelineWithCallbacks, + getGlobalOptions, + resolveEmbeddingContext, +} from "../utils"; + +export async function refreshAction( + library: string, + options: { + version?: string; + embeddingModel?: string; + serverUrl?: string; + }, + command?: Command, +) { + await analytics.track(TelemetryEvent.CLI_COMMAND, { + command: "refresh", + library, + version: options.version, + useServerUrl: !!options.serverUrl, + }); + + const serverUrl = options.serverUrl; + const globalOptions = getGlobalOptions(command); + + // Resolve embedding configuration for local execution (refresh needs embeddings) + const embeddingConfig = resolveEmbeddingContext(options.embeddingModel); + if (!serverUrl && !embeddingConfig) { + throw new Error( + "Embedding configuration is required for local refresh operations. " + + "Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution.", + ); + } + + const docService: IDocumentManagement = await createDocumentManagement({ + serverUrl, + embeddingConfig, + storePath: globalOptions.storePath, + }); + let pipeline: IPipeline | null = null; + + try { + const pipelineOptions: PipelineOptions = { + recoverJobs: false, + concurrency: 1, + serverUrl, + }; + + pipeline = await createPipelineWithCallbacks( + serverUrl ? undefined : (docService as unknown as never), + pipelineOptions, + ); + await pipeline.start(); + const refreshTool = new RefreshVersionTool(pipeline); + + // Call the tool directly - tracking is now handled inside the tool + const result = await refreshTool.execute({ + library, + version: options.version, + waitForCompletion: true, // Always wait for completion in CLI + }); + + if ("pagesRefreshed" in result) { + console.log(`✅ Successfully refreshed ${result.pagesRefreshed} pages`); + } else { + console.log(`🚀 Refresh job started with ID: ${result.jobId}`); + } + } finally { + if (pipeline) await pipeline.stop(); + await docService.shutdown(); + } +} + +export function createRefreshCommand(program: Command): Command { + return program + .command("refresh ") + .description( + "Re-scrape an existing library version, updating only changed pages.\n\n" + + "Uses HTTP ETags to efficiently skip unchanged pages and only re-process\n" + + "content that has been modified or deleted since the last scrape.\n\n" + + "Examples:\n" + + " refresh react --version 18.0.0\n" + + " refresh mylib\n" + + "\nNote: The library and version must already be indexed. 
Use 'scrape' to index a new library/version.", + ) + .option("-v, --version ", "Version of the library (optional)") + .addOption( + new Option( + "--embedding-model ", + "Embedding model configuration (e.g., 'openai:text-embedding-3-small')", + ).env("DOCS_MCP_EMBEDDING_MODEL"), + ) + .option( + "--server-url ", + "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)", + ) + .action(refreshAction); +} diff --git a/src/cli/index.test.ts b/src/cli/index.test.ts index bfa0f97a..c375bddf 100644 --- a/src/cli/index.test.ts +++ b/src/cli/index.test.ts @@ -238,6 +238,7 @@ describe("CLI Command Arguments Matrix", () => { "web", "worker", "scrape", + "refresh", "search", "list", "find-version", diff --git a/src/cli/index.ts b/src/cli/index.ts index 90c9158c..8acfea81 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -16,6 +16,7 @@ import { createFetchUrlCommand } from "./commands/fetchUrl"; import { createFindVersionCommand } from "./commands/findVersion"; import { createListCommand } from "./commands/list"; import { createMcpCommand } from "./commands/mcp"; +import { createRefreshCommand } from "./commands/refresh"; import { createRemoveCommand } from "./commands/remove"; import { createScrapeCommand } from "./commands/scrape"; import { createSearchCommand } from "./commands/search"; @@ -132,6 +133,7 @@ export function createCliProgram(): Command { createWebCommand(program); createWorkerCommand(program); createScrapeCommand(program); + createRefreshCommand(program); createSearchCommand(program); createListCommand(program); createFindVersionCommand(program); diff --git a/src/mcp/mcpServer.test.ts b/src/mcp/mcpServer.test.ts index fdc16113..c9bbd877 100644 --- a/src/mcp/mcpServer.test.ts +++ b/src/mcp/mcpServer.test.ts @@ -24,6 +24,9 @@ const mockTools: McpServerTools = { scrape: { execute: vi.fn(async () => ({ jobId: "job-123" })), } as any, + refresh: { + execute: vi.fn(async () => ({ jobId: "refresh-job-123" })), + } as any, listJobs: { execute: vi.fn(async () => ({ jobs: [] })), } as any, diff --git a/src/mcp/mcpServer.ts b/src/mcp/mcpServer.ts index 8510078c..b6fff20f 100644 --- a/src/mcp/mcpServer.ts +++ b/src/mcp/mcpServer.ts @@ -115,6 +115,56 @@ export function createMcpServerInstance( } }, ); + + // Refresh version tool - suppress deep inference issues + server.tool( + "refresh_version", + "Re-scrape a previously indexed library version, updating only changed pages.", + { + library: z.string().trim().describe("Library name."), + version: z + .string() + .trim() + .optional() + .describe("Library version (optional, refreshes unversioned if omitted)."), + }, + { + title: "Refresh Library Version", + destructiveHint: false, // Only updates changed content + openWorldHint: true, // requires internet access + }, + async ({ library, version }) => { + // Track MCP tool usage + analytics.track(TelemetryEvent.TOOL_USED, { + tool: "refresh_version", + context: "mcp_server", + library, + version, + }); + + try { + // Execute refresh tool without waiting + const result = await tools.refresh.execute({ + library, + version, + waitForCompletion: false, // Don't wait for completion + }); + + // Check the type of result + if ("jobId" in result) { + // If we got a jobId back, report that + return createResponse(`🔄 Refresh job started with ID: ${result.jobId}.`); + } + // This case shouldn't happen if waitForCompletion is false, but handle defensively + return createResponse( + `Refresh finished immediately (unexpectedly) with ${result.pagesRefreshed} pages.`, + ); + } catch (error) { + 
// Handle errors during job enqueueing or initial setup + return createError(error); + } + }, + ); } // Search docs tool diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 04a51bc7..5057b8fe 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -8,6 +8,7 @@ import { GetJobInfoTool, ListJobsTool, ListLibrariesTool, + RefreshVersionTool, RemoveTool, ScrapeTool, SearchTool, @@ -20,6 +21,7 @@ export interface McpServerTools { listLibraries: ListLibrariesTool; findVersion: FindVersionTool; scrape: ScrapeTool; + refresh: RefreshVersionTool; search: SearchTool; listJobs: ListJobsTool; getJobInfo: GetJobInfoTool; @@ -43,6 +45,7 @@ export async function initializeTools( listLibraries: new ListLibrariesTool(docService), findVersion: new FindVersionTool(docService), scrape: new ScrapeTool(pipeline), + refresh: new RefreshVersionTool(pipeline), search: new SearchTool(docService), listJobs: new ListJobsTool(pipeline), getJobInfo: new GetJobInfoTool(pipeline), diff --git a/src/pipeline/PipelineManager.ts b/src/pipeline/PipelineManager.ts index be65338c..eab25580 100644 --- a/src/pipeline/PipelineManager.ts +++ b/src/pipeline/PipelineManager.ts @@ -334,6 +334,13 @@ export class PipelineManager implements IPipeline { // Get all pages for this version with their ETags and depths const pages = await this.store.getPagesByVersionId(versionId); + // Debug: Log first page to see what data we're getting + if (pages.length > 0) { + logger.debug( + `Sample page data: url=${pages[0].url}, etag=${pages[0].etag}, depth=${pages[0].depth}`, + ); + } + if (pages.length === 0) { throw new Error( `No pages found for ${library}@${normalizedVersion || "unversioned"}. Use scrape_docs to index it first.`, @@ -360,10 +367,11 @@ export class PipelineManager implements IPipeline { url: storedOptions?.sourceUrl || pages[0].url, // Required but not used when initialQueue is set library, version: normalizedVersion, + ...(storedOptions?.options || {}), // Include stored options if available (spread first) + // Override with refresh-specific options (these must come after the spread) initialQueue, // Pre-populated queue with existing pages maxPages: pages.length, isRefresh: true, // Mark this as a refresh operation - ...(storedOptions?.options || {}), // Include stored options if available }; // Enqueue as a standard scrape job with the initialQueue diff --git a/src/pipeline/PipelineWorker.test.ts b/src/pipeline/PipelineWorker.test.ts index c84bab5d..068f0104 100644 --- a/src/pipeline/PipelineWorker.test.ts +++ b/src/pipeline/PipelineWorker.test.ts @@ -25,7 +25,7 @@ describe("PipelineWorker", () => { mockStore = { addScrapeResult: vi.fn().mockResolvedValue(undefined), removeAllDocuments: vi.fn().mockResolvedValue(undefined), - removeDocumentsByPageId: vi.fn().mockResolvedValue(undefined), + deletePage: vi.fn().mockResolvedValue(undefined), }; mockScraperService = { @@ -331,8 +331,8 @@ describe("PipelineWorker", () => { }, ); - // Simulate removeDocumentsByPageId failing - (mockStore.removeDocumentsByPageId as Mock).mockRejectedValue(deletionError); + // Simulate deletePage failing + (mockStore.deletePage as Mock).mockRejectedValue(deletionError); // Execute the job - should fail due to deletion error await expect(worker.executeJob(mockJob, mockCallbacks)).rejects.toThrow( @@ -342,7 +342,7 @@ describe("PipelineWorker", () => { // Verify scrape was called expect(mockScraperService.scrape).toHaveBeenCalledOnce(); // Verify deletion was attempted - expect(mockStore.removeDocumentsByPageId).toHaveBeenCalledWith(123); + 
expect(mockStore.deletePage).toHaveBeenCalledWith(123); // Verify onJobProgress was called expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); // Verify onJobError was called with the deletion error diff --git a/src/pipeline/PipelineWorker.ts b/src/pipeline/PipelineWorker.ts index dcdbb4ab..aad6d92f 100644 --- a/src/pipeline/PipelineWorker.ts +++ b/src/pipeline/PipelineWorker.ts @@ -64,13 +64,13 @@ export class PipelineWorker { // Handle deletion events (404 during refresh or broken links) if (progress.deleted && progress.pageId) { try { - await this.store.removeDocumentsByPageId(progress.pageId); + await this.store.deletePage(progress.pageId); logger.debug( - `[${jobId}] Deleted documents for page ${progress.pageId}: ${progress.currentUrl}`, + `[${jobId}] Deleted page ${progress.pageId}: ${progress.currentUrl}`, ); } catch (docError) { logger.error( - `❌ [${jobId}] Failed to delete documents for page ${progress.pageId}: ${docError}`, + `❌ [${jobId}] Failed to delete page ${progress.pageId}: ${docError}`, ); // Report the error and fail the job to ensure data integrity @@ -87,9 +87,9 @@ export class PipelineWorker { try { // For refresh operations, delete old documents before adding new ones if (progress.pageId) { - await this.store.removeDocumentsByPageId(progress.pageId); + await this.store.deletePage(progress.pageId); logger.debug( - `[${jobId}] Refreshing documents for page ${progress.pageId}: ${progress.currentUrl}`, + `[${jobId}] Refreshing page ${progress.pageId}: ${progress.currentUrl}`, ); } diff --git a/src/scraper/ScraperRegistry.ts b/src/scraper/ScraperRegistry.ts index 5d8f4846..9248deec 100644 --- a/src/scraper/ScraperRegistry.ts +++ b/src/scraper/ScraperRegistry.ts @@ -2,7 +2,6 @@ import { logger } from "../utils"; import { ScraperError } from "../utils/errors"; import { validateUrl } from "../utils/url"; import { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy"; -import { GitHubWikiScraperStrategy } from "./strategies/GitHubWikiScraperStrategy"; import { LocalFileStrategy } from "./strategies/LocalFileStrategy"; import { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; import { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; @@ -16,7 +15,6 @@ export class ScraperRegistry { this.strategies = [ new NpmScraperStrategy(), new PyPiScraperStrategy(), - new GitHubWikiScraperStrategy(), new GitHubScraperStrategy(), new WebScraperStrategy(), new LocalFileStrategy(), diff --git a/src/scraper/fetcher/HttpFetcher.test.ts b/src/scraper/fetcher/HttpFetcher.test.ts index c8d731c8..603249d8 100644 --- a/src/scraper/fetcher/HttpFetcher.test.ts +++ b/src/scraper/fetcher/HttpFetcher.test.ts @@ -436,20 +436,23 @@ describe("HttpFetcher", () => { await fetcher.fetch("https://example.com"); // Test behavior: verify that axios is called with required properties - expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", { - responseType: "arraybuffer", - headers: expect.objectContaining({ - "user-agent": expect.any(String), - accept: expect.any(String), - "accept-language": expect.any(String), - // Verify that our custom Accept-Encoding header is set (excluding zstd) - "Accept-Encoding": "gzip, deflate, br", + expect(mockedAxios.get).toHaveBeenCalledWith( + "https://example.com", + expect.objectContaining({ + responseType: "arraybuffer", + headers: expect.objectContaining({ + "user-agent": expect.any(String), + accept: expect.any(String), + "accept-language": expect.any(String), + // Verify that our custom Accept-Encoding header is set 
(excluding zstd) + "Accept-Encoding": "gzip, deflate, br", + }), + timeout: undefined, + maxRedirects: 5, + signal: undefined, + decompress: true, }), - timeout: undefined, - maxRedirects: 5, - signal: undefined, - decompress: true, - }); + ); }); it("should respect custom headers", async () => { @@ -464,14 +467,17 @@ describe("HttpFetcher", () => { await fetcher.fetch("https://example.com", { headers }); // Test behavior: verify custom headers are included - expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", { - responseType: "arraybuffer", - headers: expect.objectContaining(headers), - timeout: undefined, - maxRedirects: 5, - signal: undefined, - decompress: true, - }); + expect(mockedAxios.get).toHaveBeenCalledWith( + "https://example.com", + expect.objectContaining({ + responseType: "arraybuffer", + headers: expect.objectContaining(headers), + timeout: undefined, + maxRedirects: 5, + signal: undefined, + decompress: true, + }), + ); }); describe("redirect handling", () => { diff --git a/src/scraper/fetcher/HttpFetcher.ts b/src/scraper/fetcher/HttpFetcher.ts index 46d77f37..65652633 100644 --- a/src/scraper/fetcher/HttpFetcher.ts +++ b/src/scraper/fetcher/HttpFetcher.ts @@ -129,6 +129,9 @@ export class HttpFetcher implements ContentFetcher { // Add If-None-Match header for conditional requests if ETag is provided if (options?.etag) { headers["If-None-Match"] = options.etag; + logger.debug( + `Conditional request for ${source} with If-None-Match: ${options.etag}`, + ); } const config: AxiosRequestConfig = { @@ -154,7 +157,7 @@ export class HttpFetcher implements ContentFetcher { // Handle 304 Not Modified responses for conditional requests if (response.status === 304) { - logger.debug(`🔄 Content not modified (304): ${source}`); + logger.debug(`HTTP 304 Not Modified for ${source}`); return { content: Buffer.from(""), mimeType: "text/plain", @@ -192,6 +195,9 @@ export class HttpFetcher implements ContentFetcher { // Extract ETag header for caching const etag = response.headers.etag || response.headers.ETag; + if (etag) { + logger.debug(`Received ETag for ${source}: ${etag}`); + } // Extract Last-Modified header for caching const lastModified = response.headers["last-modified"]; @@ -222,7 +228,7 @@ export class HttpFetcher implements ContentFetcher { // Handle 404 Not Found - return special status for refresh operations if (status === 404) { - logger.debug(`❌ Resource not found (404): ${source}`); + logger.debug(`Resource not found (404): ${source}`); return { content: Buffer.from(""), mimeType: "text/plain", diff --git a/src/scraper/strategies/BaseScraperStrategy.ts b/src/scraper/strategies/BaseScraperStrategy.ts index 7aaa7f18..7c01ba40 100644 --- a/src/scraper/strategies/BaseScraperStrategy.ts +++ b/src/scraper/strategies/BaseScraperStrategy.ts @@ -50,6 +50,22 @@ export interface ProcessItemResult { } export abstract class BaseScraperStrategy implements ScraperStrategy { + /** + * Set of normalized URLs that have been marked for processing. + * + * IMPORTANT: URLs are added to this set BEFORE they are actually processed, not after. + * This prevents the same URL from being queued multiple times when discovered from different sources. + * + * Usage flow: + * 1. Initial queue setup: Root URL and initialQueue items are added to visited + * 2. During processing: When a page returns links, each link is checked against visited + * 3. 
In processBatch deduplication: Only links NOT in visited are added to the queue AND to visited + * + * This approach ensures: + * - No URL is processed more than once + * - No URL appears in the queue multiple times + * - Efficient deduplication across concurrent processing + */ protected visited = new Set(); protected pageCount = 0; protected totalDiscovered = 0; // Track total URLs discovered (unlimited) @@ -116,81 +132,68 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { // Pass signal to processItem const result = await this.processItem(item, options, signal); - // Handle different fetch statuses - switch (result.status) { - case FetchStatus.NOT_MODIFIED: - // File/page hasn't changed, skip processing - logger.debug(`Page unchanged (304): ${item.url}`); - return []; - - case FetchStatus.NOT_FOUND: - // File/page was deleted - if (item.pageId) { - // Signal deletion to the pipeline for refresh operations - this.pageCount++; - logger.info(`Page deleted (404): ${item.url}`); - await progressCallback({ - pagesScraped: this.pageCount, - totalPages: this.effectiveTotal, - totalDiscovered: this.totalDiscovered, - currentUrl: item.url, - depth: item.depth, - maxDepth: options.maxDepth ?? DEFAULT_MAX_DEPTH, - result: null, - pageId: item.pageId, - deleted: true, - }); - } else { - logger.warn(`Page not found (404): ${item.url}`); - } - return []; + // Only count items that represent tracked pages or have actual content + // - Refresh operations (have pageId): Always count (they're tracked in DB) + // - New files with content: Count (they're being indexed) + // - Directory discovery (no pageId, no content): Don't count + const shouldCount = item.pageId !== undefined || result.content !== undefined; + + let currentPageCount = this.pageCount; + if (shouldCount) { + currentPageCount = ++this.pageCount; + + // Log progress for all counted items + logger.info( + `🌐 Scraping page ${currentPageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`, + ); + } - case FetchStatus.SUCCESS: - // Continue with normal processing - break; + if (result.status === FetchStatus.NOT_MODIFIED) { + // File/page hasn't changed, skip processing but count as processed + logger.debug(`Page unchanged (304): ${item.url}`); + if (shouldCount) { + await progressCallback({ + pagesScraped: currentPageCount, + totalPages: this.effectiveTotal, + totalDiscovered: this.totalDiscovered, + currentUrl: item.url, + depth: item.depth, + maxDepth: maxDepth, + result: null, + pageId: item.pageId, + }); + } + return []; + } - default: - logger.error(`Unknown fetch status: ${result.status}`); - return []; + if (result.status === FetchStatus.NOT_FOUND) { + // File/page was deleted, count as processed + logger.debug(`Page deleted (404): ${item.url}`); + if (shouldCount) { + await progressCallback({ + pagesScraped: currentPageCount, + totalPages: this.effectiveTotal, + totalDiscovered: this.totalDiscovered, + currentUrl: item.url, + depth: item.depth, + maxDepth: maxDepth, + result: null, + pageId: item.pageId, + deleted: true, + }); + } + return []; } - // FIXME: I believe this is no longer required - // // If this is the root (depth 0) and we have processed content with a URL, check for redirects - // if (item.depth === 0 && !this.canonicalBaseUrl && result?.processed) { - // try { - // const finalUrlStr = result.processed.metadata.url as string | undefined; - // if (finalUrlStr) { - // const original = new URL(options.url); - // const finalUrlObj = new URL(finalUrlStr); - // if ( - // 
finalUrlObj.href !== original.href && - // (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:") - // ) { - // this.canonicalBaseUrl = finalUrlObj; - // logger.debug( - // `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`, - // ); - // } else { - // this.canonicalBaseUrl = original; - // } - // } else { - // this.canonicalBaseUrl = new URL(options.url); - // } - // } catch { - // // Ignore canonical base errors - // this.canonicalBaseUrl = new URL(options.url); - // } - // } - - // Handle successful processing + if (result.status !== FetchStatus.SUCCESS) { + logger.error(`Unknown fetch status: ${result.status}`); + return []; + } + + // Handle successful processing - report result with content if (result.content) { - this.pageCount++; - // maxDepth already resolved above - logger.info( - `🌐 Scraping page ${this.pageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`, - ); await progressCallback({ - pagesScraped: this.pageCount, + pagesScraped: currentPageCount, totalPages: this.effectiveTotal, totalDiscovered: this.totalDiscovered, currentUrl: item.url, @@ -278,34 +281,45 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { const isRefreshMode = initialQueue.length > 0; // Initialize queue and tracking - if (isRefreshMode) { - // Initialize from provided queue - this.totalDiscovered = initialQueue.length; - this.effectiveTotal = initialQueue.length; - - // Mark all URLs in the initial queue as visited to prevent re-discovery - for (const item of initialQueue) { - this.visited.add(normalizeUrl(item.url, this.options.urlNormalizerOptions)); - } + // Start with 1 to account for the depth 0 URL that will be processed + this.totalDiscovered = 1; + this.effectiveTotal = 1; + if (isRefreshMode) { logger.debug( `Starting refresh mode with ${initialQueue.length} pre-populated pages`, ); - } else { - // Normal scraping mode - this.totalDiscovered = 1; // Start with the initial URL (unlimited counter) - this.effectiveTotal = 1; // Start with the initial URL (limited counter) - - // Track the initial URL as visited - this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions)); } // Set up base URL and queue this.canonicalBaseUrl = new URL(options.url); let baseUrl = this.canonicalBaseUrl; - const queue: QueueItem[] = isRefreshMode - ? [...initialQueue] - : [{ url: options.url, depth: 0 } satisfies QueueItem]; + + // Initialize queue: Start with root URL or use items from initialQueue (refresh mode) + // The root URL is always processed (depth 0), but if it's in initialQueue, use that + // version to preserve etag/pageId for conditional fetching + const queue: QueueItem[] = []; + const normalizedRootUrl = normalizeUrl( + options.url, + this.options.urlNormalizerOptions, + ); + + if (isRefreshMode) { + // Add all items from initialQueue, using visited set to deduplicate + for (const item of initialQueue) { + const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions); + if (!this.visited.has(normalizedUrl)) { + this.visited.add(normalizedUrl); + queue.push(item); + } + } + } + + // If root URL wasn't in initialQueue, add it now at depth 0 + if (!this.visited.has(normalizedRootUrl)) { + this.visited.add(normalizedRootUrl); + queue.unshift({ url: options.url, depth: 0 } satisfies QueueItem); + } // Resolve optional values to defaults using temporary variables const maxPages = options.maxPages ?? 
DEFAULT_MAX_PAGES; diff --git a/src/scraper/strategies/GitHubRepoProcessor.ts b/src/scraper/strategies/GitHubRepoProcessor.ts new file mode 100644 index 00000000..cc100415 --- /dev/null +++ b/src/scraper/strategies/GitHubRepoProcessor.ts @@ -0,0 +1,179 @@ +import { logger } from "../../utils/logger"; +import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; +import { HttpFetcher } from "../fetcher"; +import { FetchStatus, type RawContent } from "../fetcher/types"; +import { PipelineFactory } from "../pipelines/PipelineFactory"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem } from "../types"; +import { ScrapeMode, type ScraperOptions } from "../types"; +import type { ProcessItemResult } from "./BaseScraperStrategy"; + +export interface GitHubRepoInfo { + owner: string; + repo: string; + branch?: string; + subPath?: string; +} + +export interface GitHubTreeItem { + path: string; + type: "blob" | "tree"; + sha: string; + size?: number; + url: string; +} + +export interface GitHubTreeResponse { + sha: string; + url: string; + tree: GitHubTreeItem[]; + truncated: boolean; +} + +/** + * GitHubRepoProcessor handles processing individual files from GitHub repositories. + * It processes HTTPS blob URLs (https://github.com/owner/repo/blob/branch/filepath). + * + * This processor is stateless and contains the core logic from GitHubRepoScraperStrategy. + */ +export class GitHubRepoProcessor { + private readonly httpFetcher = new HttpFetcher(); + private readonly pipelines: ContentPipeline[]; + + constructor() { + this.pipelines = PipelineFactory.createStandardPipelines(); + } + + /** + * Parses an HTTPS blob URL to extract repository information. + * Format: https://github.com/owner/repo/blob/branch/filepath + */ + parseHttpsBlobUrl(url: string): GitHubRepoInfo & { filePath: string } { + const parsedUrl = new URL(url); + const segments = parsedUrl.pathname.split("/").filter(Boolean); + + // Expected format: /owner/repo/blob/branch/filepath + if (segments.length < 5 || segments[2] !== "blob") { + throw new Error( + `Invalid GitHub blob URL format. Expected: https://github.com/owner/repo/blob/branch/filepath. Got: ${url}`, + ); + } + + const owner = segments[0]; + const repo = segments[1]; + const branch = segments[3]; + const filePath = segments.slice(4).join("/"); + + return { owner, repo, branch, filePath }; + } + + /** + * Fetches the raw content of a file from GitHub. + */ + private async fetchFileContent( + repoInfo: GitHubRepoInfo, + filePath: string, + etag?: string | null, + signal?: AbortSignal, + ): Promise { + const { owner, repo, branch } = repoInfo; + const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`; + + const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag }); + + // Override GitHub's generic 'text/plain' MIME type with file extension-based detection + const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath); + if (detectedMimeType && rawContent.mimeType === "text/plain") { + return { + ...rawContent, + mimeType: detectedMimeType, + }; + } + + return rawContent; + } + + /** + * Processes a single GitHub repository file from an HTTPS blob URL. 
+ */ + async process( + item: QueueItem, + options: ScraperOptions, + signal?: AbortSignal, + ): Promise { + // Parse the HTTPS blob URL to extract repository info and file path + const repoInfo = this.parseHttpsBlobUrl(item.url); + const { owner, repo, branch, filePath } = repoInfo; + + // Fetch the file content from raw.githubusercontent.com + const rawContent = await this.fetchFileContent( + { owner, repo, branch }, + filePath, + item.etag, + signal, + ); + + // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND + if (rawContent.status !== FetchStatus.SUCCESS) { + return { url: item.url, links: [], status: rawContent.status }; + } + + // Process content through appropriate pipeline + let processed: PipelineResult | undefined; + + for (const pipeline of this.pipelines) { + const contentBuffer = Buffer.isBuffer(rawContent.content) + ? rawContent.content + : Buffer.from(rawContent.content); + if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) { + logger.debug( + `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`, + ); + + // Force 'fetch' mode for GitHub to avoid unnecessary Playwright usage on raw content. + // GitHub raw files (e.g., HTML files) don't have their dependencies available at the + // raw.githubusercontent.com domain, so rendering them in a browser would be broken + // and provide no additional value over direct HTML parsing with Cheerio. + const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch }; + + processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher); + break; + } + } + + if (!processed) { + logger.warn( + `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`, + ); + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; + } + + for (const err of processed.errors ?? []) { + logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); + } + + // Create document with GitHub-specific metadata + const githubUrl = `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`; + + // Use filename as fallback if title is empty or not a string + const filename = filePath.split("/").pop() || "Untitled"; + + return { + url: githubUrl, + title: processed.title?.trim() || filename || "Untitled", + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, + links: [], // Always return empty links array for individual files + status: FetchStatus.SUCCESS, + }; + } + + /** + * Cleanup resources used by this processor. 
+ */ + async cleanup(): Promise { + await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close())); + } +} diff --git a/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts b/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts deleted file mode 100644 index 0671318e..00000000 --- a/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts +++ /dev/null @@ -1,478 +0,0 @@ -import { beforeEach, describe, expect, it, vi } from "vitest"; -import { FetchStatus, HttpFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; -import { HtmlPipeline } from "../pipelines/HtmlPipeline"; -import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; -import type { PipelineResult } from "../pipelines/types"; -import type { ScraperOptions } from "../types"; -import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; - -// Mock the fetcher and pipelines -vi.mock("../fetcher"); -vi.mock("../pipelines/HtmlPipeline"); -vi.mock("../pipelines/MarkdownPipeline"); - -const mockHttpFetcher = vi.mocked(HttpFetcher); -const mockHtmlPipeline = vi.mocked(HtmlPipeline); -const mockMarkdownPipeline = vi.mocked(MarkdownPipeline); - -describe("GitHubRepoScraperStrategy", () => { - let strategy: GitHubRepoScraperStrategy; - let httpFetcherInstance: any; - let htmlPipelineInstance: any; - let markdownPipelineInstance: any; - - beforeEach(() => { - vi.clearAllMocks(); - - // Setup fetcher mock - httpFetcherInstance = { - fetch: vi.fn(), - }; - mockHttpFetcher.mockImplementation(() => httpFetcherInstance); - - // Setup pipeline mocks - htmlPipelineInstance = { - canProcess: vi.fn(), - process: vi.fn(), - close: vi.fn(), - }; - markdownPipelineInstance = { - canProcess: vi.fn(), - process: vi.fn(), - close: vi.fn(), - }; - mockHtmlPipeline.mockImplementation(() => htmlPipelineInstance); - mockMarkdownPipeline.mockImplementation(() => markdownPipelineInstance); - - strategy = new GitHubRepoScraperStrategy(); - }); - - describe("canHandle", () => { - it("should handle GitHub URLs", () => { - expect(strategy.canHandle("https://github.com/owner/repo")).toBe(true); - expect(strategy.canHandle("https://www.github.com/owner/repo")).toBe(true); - expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(true); - expect( - strategy.canHandle("https://github.com/owner/repo/blob/main/README.md"), - ).toBe(true); - }); - - it("should not handle non-GitHub URLs", () => { - expect(strategy.canHandle("https://gitlab.com/owner/repo")).toBe(false); - expect(strategy.canHandle("https://bitbucket.org/owner/repo")).toBe(false); - expect(strategy.canHandle("https://example.com")).toBe(false); - }); - }); - - describe("parseGitHubUrl", () => { - it("should parse basic repository URL", () => { - const result = strategy.parseGitHubUrl("https://github.com/owner/repo"); - expect(result).toEqual({ owner: "owner", repo: "repo" }); - }); - - it("should parse tree URL with branch", () => { - const result = strategy.parseGitHubUrl("https://github.com/owner/repo/tree/main"); - expect(result).toEqual({ owner: "owner", repo: "repo", branch: "main" }); - }); - - it("should parse tree URL with branch and subpath", () => { - const result = strategy.parseGitHubUrl( - "https://github.com/owner/repo/tree/main/docs", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - branch: "main", - subPath: "docs", - }); - }); - - it("should parse blob URL", () => { - const result = strategy.parseGitHubUrl( - "https://github.com/owner/repo/blob/main/README.md", - ); - 
expect(result).toEqual({ - owner: "owner", - repo: "repo", - branch: "main", - filePath: "README.md", - isBlob: true, - }); - }); - - it("should parse blob URL without file path", () => { - const result = strategy.parseGitHubUrl("https://github.com/owner/repo/blob/main"); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - branch: "main", - filePath: undefined, - isBlob: true, - }); - }); - - it("should throw error for invalid repository URL", () => { - expect(() => { - strategy.parseGitHubUrl("https://github.com/invalid"); - }).toThrow("Invalid GitHub repository URL"); - }); - }); - - describe("processItem", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; - - beforeEach(() => { - // Mock repository info response - httpFetcherInstance.fetch.mockImplementation((url: string) => { - if (url.includes("api.github.com/repos")) { - return Promise.resolve({ - textContent: JSON.stringify({ default_branch: "main" }), - mimeType: "application/json", - source: url, - charset: "utf-8", - }); - } - if (url.includes("git/trees")) { - return Promise.resolve({ - textContent: JSON.stringify({ - sha: "tree123", - url: "https://api.github.com/repos/owner/repo/git/trees/tree123", - tree: [ - { - path: "README.md", - type: "blob", - sha: "abc123", - size: 1024, - url: "https://api.github.com/repos/owner/repo/git/blobs/abc123", - }, - { - path: "src/index.js", - type: "blob", - sha: "def456", - size: 512, - url: "https://api.github.com/repos/owner/repo/git/blobs/def456", - }, - { - path: "binary-file.png", - type: "blob", - sha: "ghi789", - size: 2048, - url: "https://api.github.com/repos/owner/repo/git/blobs/ghi789", - }, - ], - truncated: false, - }), - mimeType: "application/json", - source: url, - charset: "utf-8", - }); - } - return Promise.resolve({ - textContent: "file content", - mimeType: "text/plain", - source: url, - charset: "utf-8", - }); - }); - }); - - it("should discover repository structure and return file links", async () => { - const item = { url: "https://github.com/owner/repo", depth: 0 }; - - // Mock the fetchRepositoryTree method directly since it's a complex interaction - const mockFetchRepositoryTree = vi - .spyOn(strategy as any, "fetchRepositoryTree") - .mockResolvedValue({ - tree: { - sha: "tree123", - url: "https://api.github.com/repos/owner/repo/git/trees/tree123", - tree: [ - { - path: "README.md", - type: "blob", - sha: "abc123", - size: 1024, - url: "https://api.github.com/repos/owner/repo/git/blobs/abc123", - }, - { - path: "src/index.js", - type: "blob", - sha: "def456", - size: 512, - url: "https://api.github.com/repos/owner/repo/git/blobs/def456", - }, - { - path: "binary-file.png", - type: "blob", - sha: "ghi789", - size: 2048, - url: "https://api.github.com/repos/owner/repo/git/blobs/ghi789", - }, - ], - truncated: false, - }, - resolvedBranch: "main", - }); - - const result = await strategy.processItem(item, options); - - expect(result.links).toEqual([ - "github-file://README.md", - "github-file://src/index.js", - ]); - expect(result.content).toBeUndefined(); - - // Clean up the spy - mockFetchRepositoryTree.mockRestore(); - }); - - it("should handle blob URL with file path", async () => { - const blobOptions = { - ...options, - url: "https://github.com/owner/repo/blob/main/README.md", - }; - const item = { url: "https://github.com/owner/repo/blob/main/README.md", depth: 0 }; - const result = await strategy.processItem(item, blobOptions); - - 
expect(result.links).toEqual(["github-file://README.md"]); - expect(result.content).toBeUndefined(); - }); - - it("should handle blob URL without file path", async () => { - const blobOptions = { - ...options, - url: "https://github.com/owner/repo/blob/main", - }; - const item = { url: "https://github.com/owner/repo/blob/main", depth: 0 }; - const result = await strategy.processItem(item, blobOptions); - - expect(result.links).toEqual([]); - expect(result.content).toBeUndefined(); - }); - - it("should process individual file content", async () => { - const rawContent: RawContent = { - content: "# Test File\nThis is a test markdown file.", - mimeType: "text/markdown", - source: "https://raw.githubusercontent.com/owner/repo/main/README.md", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: "Test File\nThis is a test markdown file.", - title: "Test File", - chunks: [], - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - markdownPipelineInstance.canProcess.mockReturnValue(true); - markdownPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "github-file://README.md", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.content?.textContent).toBe( - "Test File\nThis is a test markdown file.", - ); - expect(result.contentType).toBe("text/markdown"); - expect(result.url).toBe("https://github.com/owner/repo/blob/main/README.md"); - expect(result.content?.title).toBe("Test File"); - expect(result.links).toEqual([]); - }); - - it("should use filename as title fallback when no title found", async () => { - const rawContent: RawContent = { - content: "Some content without title", - mimeType: "text/plain", - source: "https://raw.githubusercontent.com/owner/repo/main/config.txt", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: "Some content without title", - title: "", - chunks: [], - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - markdownPipelineInstance.canProcess.mockReturnValue(true); - markdownPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "github-file://config.txt", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.title).toBe("config.txt"); - }); - - it("should handle unsupported content types", async () => { - const rawContent: RawContent = { - content: "binary content", - mimeType: "application/octet-stream", - source: "https://raw.githubusercontent.com/owner/repo/main/binary.bin", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(false); - markdownPipelineInstance.canProcess.mockReturnValue(false); - - const item = { url: "github-file://binary.bin", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.content).toBeUndefined(); - expect(result.links).toEqual([]); - }); - }); - - describe("shouldProcessFile", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; - - it("should process text files", () => { - const textFiles = [ - { path: "README.md", type: "blob" as const }, - { path: "src/index.js", type: "blob" as const }, - { path: "docs/guide.rst", type: "blob" as const }, - { path: 
"package.json", type: "blob" as const }, - { path: "Dockerfile", type: "blob" as const }, - // Note: LICENSE files are excluded by default patterns, so we don't test it here - ]; - - for (const file of textFiles) { - // @ts-expect-error Accessing private method for testing - expect(strategy.shouldProcessFile(file, options)).toBe(true); - } - }); - - it("should skip binary files", () => { - const binaryFiles = [ - { path: "image.png", type: "blob" as const }, - { path: "video.mp4", type: "blob" as const }, - { path: "archive.zip", type: "blob" as const }, - { path: "binary.exe", type: "blob" as const }, - ]; - - for (const file of binaryFiles) { - // @ts-expect-error Accessing private method for testing - expect(strategy.shouldProcessFile(file, options)).toBe(false); - } - }); - - it("should skip tree items", () => { - const treeItem = { path: "src", type: "tree" as const }; - // @ts-expect-error Accessing private method for testing - expect(strategy.shouldProcessFile(treeItem, options)).toBe(false); - }); - - it("should respect include patterns", () => { - const optionsWithInclude = { - ...options, - includePatterns: ["*.md", "src/*"], - }; - - expect( - // @ts-expect-error Accessing private method for testing - strategy.shouldProcessFile( - { - path: "README.md", - type: "blob" as const, - sha: "abc123", - url: "https://api.github.com/repos/owner/repo/git/blobs/abc123", - }, - optionsWithInclude, - ), - ).toBe(true); - expect( - // @ts-expect-error Accessing private method for testing - strategy.shouldProcessFile( - { - path: "src/index.js", - type: "blob" as const, - sha: "def456", - url: "https://api.github.com/repos/owner/repo/git/blobs/def456", - }, - optionsWithInclude, - ), - ).toBe(true); - expect( - // @ts-expect-error Accessing private method for testing - strategy.shouldProcessFile( - { - path: "package.json", - type: "blob" as const, - sha: "ghi789", - url: "https://api.github.com/repos/owner/repo/git/blobs/ghi789", - }, - optionsWithInclude, - ), - ).toBe(false); - }); - - it("should respect exclude patterns", () => { - const optionsWithExclude = { - ...options, - excludePatterns: ["**/*.test.js", "node_modules/**"], - }; - - expect( - // @ts-expect-error Accessing private method for testing - strategy.shouldProcessFile( - { - path: "src/index.js", - type: "blob" as const, - sha: "abc123", - url: "https://api.github.com/repos/owner/repo/git/blobs/abc123", - }, - optionsWithExclude, - ), - ).toBe(true); - expect( - // @ts-expect-error Accessing private method for testing - strategy.shouldProcessFile( - { - path: "src/index.test.js", - type: "blob" as const, - sha: "def456", - url: "https://api.github.com/repos/owner/repo/git/blobs/def456", - }, - optionsWithExclude, - ), - ).toBe(false); - expect( - // @ts-expect-error Accessing private method for testing - strategy.shouldProcessFile( - { - path: "node_modules/package/index.js", - type: "blob" as const, - sha: "ghi789", - url: "https://api.github.com/repos/owner/repo/git/blobs/ghi789", - }, - optionsWithExclude, - ), - ).toBe(false); - }); - }); - - describe("cleanup", () => { - it("should cleanup pipeline resources", async () => { - await strategy.cleanup(); - expect(htmlPipelineInstance.close).toHaveBeenCalled(); - expect(markdownPipelineInstance.close).toHaveBeenCalled(); - }); - }); -}); diff --git a/src/scraper/strategies/GitHubRepoScraperStrategy.ts b/src/scraper/strategies/GitHubRepoScraperStrategy.ts deleted file mode 100644 index 770a981c..00000000 --- a/src/scraper/strategies/GitHubRepoScraperStrategy.ts +++ /dev/null 
@@ -1,529 +0,0 @@ -import type { ProgressCallback } from "../../types"; -import { logger } from "../../utils/logger"; -import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; -import { HttpFetcher } from "../fetcher"; -import { FetchStatus, type RawContent } from "../fetcher/types"; -import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline, PipelineResult } from "../pipelines/types"; -import type { QueueItem } from "../types"; -import { ScrapeMode, type ScraperOptions, type ScraperProgressEvent } from "../types"; -import { shouldIncludeUrl } from "../utils/patternMatcher"; -import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; - -interface GitHubRepoInfo { - owner: string; - repo: string; - branch?: string; - subPath?: string; -} - -interface GitHubTreeItem { - path: string; - type: "blob" | "tree"; - sha: string; - size?: number; - url: string; -} - -interface GitHubTreeResponse { - sha: string; - url: string; - tree: GitHubTreeItem[]; - truncated: boolean; -} - -/** - * GitHubRepoScraperStrategy handles native repository crawling by accessing GitHub's tree API - * to discover repository structure and fetching raw file contents. This treats repositories - * more like file systems rather than web pages. - * - * Features: - * - Uses GitHub tree API for efficient repository structure discovery - * - Fetches raw file contents from raw.githubusercontent.com - * - Processes all text files (source code, markdown, documentation, etc.) - * - Supports branch-specific crawling (defaults to main/default branch) - * - Automatically detects repository default branch when no branch specified - * - Respects repository subpath URLs (e.g., /tree//docs) by limiting indexed files - * - Filters out binary files and processes only text-based content - * - * Note: Wiki pages are not currently supported in this native mode. For wiki access, - * consider using the web scraping approach or a separate scraping job. - */ -export class GitHubRepoScraperStrategy extends BaseScraperStrategy { - private readonly httpFetcher = new HttpFetcher(); - private readonly pipelines: ContentPipeline[]; - private resolvedBranch?: string; // Cache the resolved default branch - - constructor() { - super(); - this.pipelines = PipelineFactory.createStandardPipelines(); - } - - canHandle(url: string): boolean { - const { hostname } = new URL(url); - return ["github.com", "www.github.com"].includes(hostname); - } - - /** - * Override shouldProcessUrl to handle github-file:// URLs specially. - * These URLs bypass scope checking since they're internal file references. - */ - protected shouldProcessUrl(url: string, options: ScraperOptions): boolean { - // For github-file:// URLs, only apply include/exclude patterns, skip scope checking - if (url.startsWith("github-file://")) { - const filePath = url.replace("github-file://", ""); - return shouldIncludeUrl(filePath, options.includePatterns, options.excludePatterns); - } - - // For regular URLs, use the base implementation - return super.shouldProcessUrl(url, options); - } - - /** - * Parses a GitHub URL to extract repository information. - */ - parseGitHubUrl(url: string): GitHubRepoInfo & { isBlob?: boolean; filePath?: string } { - const parsedUrl = new URL(url); - // Extract // from github.com///... 
- const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)/); - if (!match) { - throw new Error(`Invalid GitHub repository URL: ${url}`); - } - - const [, owner, repo] = match; - - // Extract branch and optional subpath from URLs like /tree// - const segments = parsedUrl.pathname.split("/").filter(Boolean); - - // Handle /blob/ URLs for single file indexing - if (segments.length >= 4 && segments[2] === "blob") { - const branch = segments[3]; - const filePath = segments.length > 4 ? segments.slice(4).join("/") : undefined; - return { owner, repo, branch, filePath, isBlob: true }; - } - - // Only handle URLs of the form /owner/repo/tree/branch/subPath - if (segments.length < 4 || segments[2] !== "tree") { - // Unsupported format (missing branch, or not a tree/blob URL) - return { owner, repo }; - } - - const branch = segments[3]; - const subPath = segments.length > 4 ? segments.slice(4).join("/") : undefined; - - return { owner, repo, branch, subPath }; - } - - /** - * Fetches the repository tree structure from GitHub API. - * Uses 'HEAD' to get the default branch if no branch is specified. - */ - async fetchRepositoryTree( - repoInfo: GitHubRepoInfo, - signal?: AbortSignal, - ): Promise<{ tree: GitHubTreeResponse; resolvedBranch: string }> { - const { owner, repo, branch } = repoInfo; - - // If no branch specified, fetch the default branch first - let targetBranch = branch; - if (!targetBranch) { - try { - // Get repository information to find the default branch - const repoUrl = `https://api.github.com/repos/${owner}/${repo}`; - logger.debug(`Fetching repository info: ${repoUrl}`); - - const repoContent = await this.httpFetcher.fetch(repoUrl, { signal }); - const content = - typeof repoContent.content === "string" - ? repoContent.content - : repoContent.content.toString("utf-8"); - const repoData = JSON.parse(content) as { default_branch: string }; - targetBranch = repoData.default_branch; - - logger.debug(`Using default branch: ${targetBranch}`); - } catch (error) { - logger.warn(`⚠️ Could not fetch default branch, using 'main': ${error}`); - targetBranch = "main"; - } - } - - // Cache the resolved branch for file fetching - this.resolvedBranch = targetBranch; - - const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`; - - logger.debug(`Fetching repository tree: ${treeUrl}`); - - const rawContent = await this.httpFetcher.fetch(treeUrl, { signal }); - const content = - typeof rawContent.content === "string" - ? rawContent.content - : rawContent.content.toString("utf-8"); - const treeData = JSON.parse(content) as GitHubTreeResponse; - - if (treeData.truncated) { - logger.warn( - `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`, - ); - } - - return { tree: treeData, resolvedBranch: targetBranch }; - } - - /** - * Determines if a file should be processed based on its path and type. 
- */ - private shouldProcessFile(item: GitHubTreeItem, options: ScraperOptions): boolean { - // Only process blob (file) items, not trees (directories) - if (item.type !== "blob") { - return false; - } - - const path = item.path; - - // Whitelist of text-based file extensions that we can process - const textExtensions = [ - // Documentation - ".md", - ".mdx", - ".txt", - ".rst", - ".adoc", - ".asciidoc", - - // Web technologies - ".html", - ".htm", - ".xml", - ".css", - ".scss", - ".sass", - ".less", - - // Programming languages - ".js", - ".jsx", - ".ts", - ".tsx", - ".py", - ".java", - ".c", - ".cpp", - ".cc", - ".cxx", - ".h", - ".hpp", - ".cs", - ".go", - ".rs", - ".rb", - ".php", - ".swift", - ".kt", - ".scala", - ".clj", - ".cljs", - ".hs", - ".elm", - ".dart", - ".r", - ".m", - ".mm", - ".sh", - ".bash", - ".zsh", - ".fish", - ".ps1", - ".bat", - ".cmd", - - // Configuration and data - ".json", - ".yaml", - ".yml", - ".toml", - ".ini", - ".cfg", - ".conf", - ".properties", - ".env", - ".gitignore", - ".dockerignore", - ".gitattributes", - ".editorconfig", - - // Build and package management - ".gradle", - ".pom", - ".sbt", - ".maven", - ".cmake", - ".make", - ".dockerfile", - ".mod", // Go modules (go.mod) - ".sum", // Go checksums (go.sum) - - // Other text formats - ".sql", - ".graphql", - ".gql", - ".proto", - ".thrift", - ".avro", - ".csv", - ".tsv", - ".log", - ]; - - const pathLower = path.toLowerCase(); - - // Check for known text extensions - const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext)); - - // Check for compound extensions and special cases - const hasCompoundExtension = - pathLower.includes(".env.") || // .env.example, .env.local, etc. - pathLower.endsWith(".env") || - pathLower.includes(".config.") || // webpack.config.js, etc. - pathLower.includes(".lock"); // package-lock.json, etc. - - // Also include files without extensions that are commonly text files - const fileName = path.split("/").pop() || ""; - const fileNameLower = fileName.toLowerCase(); - const commonTextFiles = [ - // Documentation files without extensions - "readme", - "license", - "changelog", - "contributing", - "authors", - "maintainers", - - // Build files without extensions - "dockerfile", - "makefile", - "rakefile", - "gemfile", - "podfile", - "cartfile", - "brewfile", - "procfile", - "vagrantfile", - "gulpfile", - "gruntfile", - - // Configuration files (dotfiles) - ".prettierrc", - ".eslintrc", - ".babelrc", - ".nvmrc", - ".npmrc", - ]; - - const isCommonTextFile = commonTextFiles.some((name) => { - if (name.startsWith(".")) { - // For dotfiles, match exactly or with additional extension (e.g., .prettierrc.js) - return fileNameLower === name || fileNameLower.startsWith(`${name}.`); - } - // For regular files, match exactly or with extension - return fileNameLower === name || fileNameLower.startsWith(`${name}.`); - }); - - // Process file if it has a text extension, compound extension, or is a common text file - if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) { - return false; - } - - // Apply user-defined include/exclude patterns (use the file path directly) - return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns); - } - - /** - * Fetches the raw content of a file from GitHub. 
- */ - async fetchFileContent( - repoInfo: GitHubRepoInfo, - filePath: string, - signal?: AbortSignal, - ): Promise { - const { owner, repo } = repoInfo; - // Use resolved branch if available, otherwise use provided branch or default to main - const branch = this.resolvedBranch || repoInfo.branch || "main"; - const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`; - - const rawContent = await this.httpFetcher.fetch(rawUrl, { signal }); - - // Override GitHub's generic 'text/plain' MIME type with file extension-based detection - const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath); - if (detectedMimeType && rawContent.mimeType === "text/plain") { - return { - ...rawContent, - mimeType: detectedMimeType, - }; - } - - return rawContent; - } - - async processItem( - item: QueueItem, - options: ScraperOptions, - signal?: AbortSignal, - ): Promise { - // Parse the URL to get repository information - const repoInfo = this.parseGitHubUrl(options.url); - const pageCount = this.pageCount; - - // For the initial item, handle blob URLs differently than tree URLs - if (item.depth === 0) { - // Handle single file (blob) URLs - if ("isBlob" in repoInfo && repoInfo.isBlob) { - if (repoInfo.filePath) { - logger.info( - `📄 Processing single file: ${repoInfo.owner}/${repoInfo.repo}/${repoInfo.filePath}`, - ); - - // Process the single file directly - return { - url: item.url, - links: [`github-file://${repoInfo.filePath}`], - status: FetchStatus.SUCCESS, - }; - } else { - // Blob URL without file path - return empty links - logger.warn( - `⚠️ Blob URL without file path: ${options.url}. No files to process.`, - ); - return { url: item.url, links: [], status: FetchStatus.SUCCESS }; - } - } - - // Handle repository tree crawling (existing logic) - logger.info( - `🗂️ Discovering repository structure for ${repoInfo.owner}/${repoInfo.repo}`, - ); - - const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal); - const fileItems = tree.tree - .filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)) - .filter((treeItem) => this.shouldProcessFile(treeItem, options)); - - logger.info( - `📁 Found ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`, - ); - - // Convert tree items to URLs for the queue - const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`); - - return { url: item.url, links, status: FetchStatus.SUCCESS }; - } - - // Process individual files - if (item.url.startsWith("github-file://")) { - const filePath = item.url.replace("github-file://", ""); - - logger.info(`🗂️ Processing file ${pageCount}/${options.maxPages}: ${filePath}`); - - const rawContent = await this.fetchFileContent(repoInfo, filePath, signal); - - // Process content through appropriate pipeline - let processed: PipelineResult | undefined; - - for (const pipeline of this.pipelines) { - const contentBuffer = Buffer.isBuffer(rawContent.content) - ? rawContent.content - : Buffer.from(rawContent.content); - if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) { - logger.debug( - `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`, - ); - - // Force 'fetch' mode for GitHub to avoid unnecessary Playwright usage on raw content. 
- // GitHub raw files (e.g., HTML files) don't have their dependencies available at the - // raw.githubusercontent.com domain, so rendering them in a browser would be broken - // and provide no additional value over direct HTML parsing with Cheerio. - const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch }; - - processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher); - break; - } - } - - if (!processed) { - logger.warn( - `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`, - ); - return { url: item.url, links: [], status: FetchStatus.SUCCESS }; - } - - for (const err of processed.errors ?? []) { - logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); - } - - // Create document with GitHub-specific metadata - const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`; - - // Use filename as fallback if title is empty or not a string - const filename = filePath.split("/").pop() || "Untitled"; - - return { - url: githubUrl, - title: processed.title?.trim() || filename || "Untitled", - etag: rawContent.etag, - lastModified: rawContent.lastModified, - contentType: rawContent.mimeType, - content: processed, - links: [], // Always return empty links array for individual files - status: FetchStatus.SUCCESS, - }; - } - - return { url: item.url, links: [], status: FetchStatus.SUCCESS }; - } - - /** - * Normalize a path by removing leading and trailing slashes. - */ - private normalizePath(path: string): string { - return path.replace(/^\/+/, "").replace(/\/+$/, ""); - } - - private isWithinSubPath(path: string, subPath?: string): boolean { - if (!subPath) { - return true; - } - - const trimmedSubPath = this.normalizePath(subPath); - if (trimmedSubPath.length === 0) { - return true; - } - - const normalizedPath = this.normalizePath(path); - if (normalizedPath === trimmedSubPath) { - return true; - } - - return normalizedPath.startsWith(`${trimmedSubPath}/`); - } - - async scrape( - options: ScraperOptions, - progressCallback: ProgressCallback, - signal?: AbortSignal, - ): Promise { - // Validate it's a GitHub URL - const url = new URL(options.url); - if (!url.hostname.includes("github.com")) { - throw new Error("URL must be a GitHub URL"); - } - - return super.scrape(options, progressCallback, signal); - } - - /** - * Cleanup resources used by this strategy, specifically the pipeline browser instances. 
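(As an aside, a small sketch of the two URL shapes involved per repository file — the raw.githubusercontent.com URL used for fetching and the github.com blob URL stored as the document source — together with the subpath check described above. The helper names are illustrative; the URL formats and the normalization logic mirror the strategy code in this patch.)

```ts
// Sketch: URL shapes and subpath scoping used per repository file.
const rawUrl = (owner: string, repo: string, branch: string, filePath: string) =>
  // Download URL for the file contents.
  `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`;

const blobUrl = (owner: string, repo: string, branch: string, filePath: string) =>
  // User-facing URL stored as the document source.
  `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`;

// Keep a file only if it lives at or below the requested subdirectory.
function isWithinSubPath(path: string, subPath?: string): boolean {
  if (!subPath) return true;
  const trim = (p: string) => p.replace(/^\/+/, "").replace(/\/+$/, "");
  const base = trim(subPath);
  const candidate = trim(path);
  return base.length === 0 || candidate === base || candidate.startsWith(`${base}/`);
}

// isWithinSubPath("docs/guide.md", "docs") -> true
// isWithinSubPath("src/index.ts", "docs")  -> false
console.log(rawUrl("owner", "repo", "main", "README.md"));
console.log(blobUrl("owner", "repo", "main", "README.md"));
```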
- */ - async cleanup(): Promise { - await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close())); - } -} diff --git a/src/scraper/strategies/GitHubScraperStrategy.test.ts b/src/scraper/strategies/GitHubScraperStrategy.test.ts index bcddfdf7..5a283239 100644 --- a/src/scraper/strategies/GitHubScraperStrategy.test.ts +++ b/src/scraper/strategies/GitHubScraperStrategy.test.ts @@ -1,40 +1,25 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { ProgressCallback } from "../../types"; -import type { ScraperProgressEvent } from "../types"; -import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; +import { FetchStatus, HttpFetcher } from "../fetcher"; +import type { ScraperOptions } from "../types"; import { GitHubScraperStrategy } from "./GitHubScraperStrategy"; -import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; -// Mock the underlying strategies -vi.mock("./GitHubRepoScraperStrategy"); -vi.mock("./GitHubWikiScraperStrategy"); +// Mock the dependencies +vi.mock("../fetcher"); -const mockRepoStrategy = vi.mocked(GitHubRepoScraperStrategy); -const mockWikiStrategy = vi.mocked(GitHubWikiScraperStrategy); +const mockHttpFetcher = vi.mocked(HttpFetcher); describe("GitHubScraperStrategy", () => { let strategy: GitHubScraperStrategy; - let repoStrategyInstance: any; - let wikiStrategyInstance: any; + let httpFetcherInstance: any; beforeEach(() => { vi.clearAllMocks(); - // Setup repo strategy mock - repoStrategyInstance = { - canHandle: vi.fn(), - scrape: vi.fn(), - cleanup: vi.fn(), + // Setup fetcher mock + httpFetcherInstance = { + fetch: vi.fn(), }; - mockRepoStrategy.mockImplementation(() => repoStrategyInstance); - - // Setup wiki strategy mock - wikiStrategyInstance = { - canHandle: vi.fn(), - scrape: vi.fn(), - cleanup: vi.fn(), - }; - mockWikiStrategy.mockImplementation(() => wikiStrategyInstance); + mockHttpFetcher.mockImplementation(() => httpFetcherInstance); strategy = new GitHubScraperStrategy(); }); @@ -42,18 +27,24 @@ describe("GitHubScraperStrategy", () => { describe("canHandle", () => { it("should handle base GitHub repository URLs", () => { expect(strategy.canHandle("https://github.com/owner/repo")).toBe(true); - expect(strategy.canHandle("https://github.com/owner/repo/")).toBe(true); expect(strategy.canHandle("https://www.github.com/owner/repo")).toBe(true); + expect(strategy.canHandle("https://github.com/owner/repo/")).toBe(true); }); - it("should not handle GitHub URLs with specific paths", () => { - expect(strategy.canHandle("https://github.com/owner/repo/wiki")).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/wiki/Home")).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(false); + it("should handle tree URLs with branch", () => { + expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(true); + expect(strategy.canHandle("https://github.com/owner/repo/tree/develop/src")).toBe( + true, + ); + }); + + it("should handle blob URLs with file paths", () => { expect( strategy.canHandle("https://github.com/owner/repo/blob/main/README.md"), - ).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/issues")).toBe(false); + ).toBe(true); + expect( + strategy.canHandle("https://github.com/owner/repo/blob/main/src/index.js"), + ).toBe(true); }); it("should not handle non-GitHub URLs", () => { @@ -62,105 +53,350 @@ describe("GitHubScraperStrategy", () => { 
expect(strategy.canHandle("https://example.com")).toBe(false); }); - it("should not handle invalid URLs", () => { - expect(strategy.canHandle("invalid-url")).toBe(false); - expect(strategy.canHandle("")).toBe(false); + it("should not handle GitHub wiki URLs", () => { + expect(strategy.canHandle("https://github.com/owner/repo/wiki")).toBe(false); + expect(strategy.canHandle("https://github.com/owner/repo/wiki/Page")).toBe(false); }); - }); - // Note: shouldProcessUrl is a protected method that delegates to underlying strategies, - // but it's mainly used internally. The most important behavior is tested via the scrape() method. - - describe("scrape", () => { - it("should orchestrate both repo and wiki scraping", async () => { - const options = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; + it("should not handle other GitHub paths", () => { + expect(strategy.canHandle("https://github.com/owner/repo/issues")).toBe(false); + expect(strategy.canHandle("https://github.com/owner/repo/pulls")).toBe(false); + }); + }); - const progressCallback = vi.fn>(); + describe("parseGitHubUrl", () => { + it("should parse basic repository URL", () => { + const result = (strategy as any).parseGitHubUrl("https://github.com/owner/repo"); + expect(result).toEqual({ owner: "owner", repo: "repo" }); + }); - repoStrategyInstance.scrape.mockResolvedValue(undefined); - wikiStrategyInstance.scrape.mockResolvedValue(undefined); + it("should parse tree URL with branch", () => { + const result = (strategy as any).parseGitHubUrl( + "https://github.com/owner/repo/tree/main", + ); + expect(result).toEqual({ owner: "owner", repo: "repo", branch: "main" }); + }); - await strategy.scrape(options, progressCallback); + it("should parse tree URL with branch and subpath", () => { + const result = (strategy as any).parseGitHubUrl( + "https://github.com/owner/repo/tree/main/docs", + ); + expect(result).toEqual({ + owner: "owner", + repo: "repo", + branch: "main", + subPath: "docs", + }); + }); - // Should scrape wiki first (prioritized) - expect(wikiStrategyInstance.scrape).toHaveBeenCalledWith( - expect.objectContaining({ - ...options, - url: "https://github.com/owner/repo/wiki", - }), - expect.any(Function), - undefined, + it("should parse blob URL with file", () => { + const result = (strategy as any).parseGitHubUrl( + "https://github.com/owner/repo/blob/main/README.md", ); + expect(result).toEqual({ + owner: "owner", + repo: "repo", + branch: "main", + filePath: "README.md", + isBlob: true, + }); + }); - // Should then scrape repository with adjusted maxPages - expect(repoStrategyInstance.scrape).toHaveBeenCalledWith( - expect.objectContaining({ - ...options, - maxPages: 1000, // Default maxPages since no wiki pages were scraped in mock - }), - expect.any(Function), - undefined, + it("should parse blob URL with nested file path", () => { + const result = (strategy as any).parseGitHubUrl( + "https://github.com/owner/repo/blob/main/src/index.js", ); + expect(result).toEqual({ + owner: "owner", + repo: "repo", + branch: "main", + filePath: "src/index.js", + isBlob: true, + }); }); - it("should handle wiki scraping failure gracefully", async () => { - const options = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; + it("should throw error for invalid repository URL", () => { + expect(() => { + (strategy as any).parseGitHubUrl("https://github.com/invalid"); + }).toThrow("Invalid GitHub repository URL"); + }); + }); + + describe("shouldProcessFile", 
() => { + const options: ScraperOptions = { + url: "https://github.com/owner/repo", + library: "test-lib", + version: "1.0.0", + }; - const progressCallback = vi.fn>(); + it("should process text files with common extensions", () => { + const textFiles = [ + { path: "README.md", type: "blob" as const }, + { path: "src/index.js", type: "blob" as const }, + { path: "docs/guide.rst", type: "blob" as const }, + { path: "package.json", type: "blob" as const }, + { path: "config.yaml", type: "blob" as const }, + { path: "script.py", type: "blob" as const }, + ]; - repoStrategyInstance.scrape.mockResolvedValue(undefined); - wikiStrategyInstance.scrape.mockRejectedValue(new Error("Wiki not found")); + for (const file of textFiles) { + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(true); + } + }); - // Should not throw error when wiki fails - await expect(strategy.scrape(options, progressCallback)).resolves.toBeUndefined(); + it("should process common text files without extensions", () => { + const commonFiles = [ + { path: "Dockerfile", type: "blob" as const }, + { path: "Makefile", type: "blob" as const }, + { path: "README", type: "blob" as const }, + { path: "CHANGELOG", type: "blob" as const }, + ]; - expect(repoStrategyInstance.scrape).toHaveBeenCalled(); - expect(wikiStrategyInstance.scrape).toHaveBeenCalled(); + for (const file of commonFiles) { + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(true); + } }); - it("should validate GitHub URLs", async () => { - const options = { - url: "https://example.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; + it("should process config files", () => { + const configFiles = [ + { path: ".prettierrc", type: "blob" as const }, + { path: ".eslintrc", type: "blob" as const }, + { path: ".babelrc", type: "blob" as const }, + { path: ".env", type: "blob" as const }, + { path: ".env.local", type: "blob" as const }, + ]; - const progressCallback = vi.fn>(); + for (const file of configFiles) { + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(true); + } + }); - await expect(strategy.scrape(options, progressCallback)).rejects.toThrow( - "URL must be a GitHub URL", - ); + it("should skip binary files", () => { + const binaryFiles = [ + { path: "image.png", type: "blob" as const }, + { path: "video.mp4", type: "blob" as const }, + { path: "archive.zip", type: "blob" as const }, + { path: "binary.exe", type: "blob" as const }, + { path: "lib.so", type: "blob" as const }, + { path: "app.dmg", type: "blob" as const }, + ]; + + for (const file of binaryFiles) { + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(false); + } + }); + + it("should skip tree items (directories)", () => { + const treeItem = { path: "src", type: "tree" as const }; + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(treeItem, options)).toBe(false); }); - it("should validate repository URL format", async () => { - const options = { - url: "https://github.com/owner/repo/tree/main", - library: "test-lib", - version: "1.0.0", + it("should respect include patterns", () => { + const optionsWithInclude = { + ...options, + includePatterns: ["*.md", "src/**"], }; - const progressCallback = vi.fn>(); + expect( + // @ts-expect-error Accessing private method for testing + 
strategy.shouldProcessFile( + { path: "README.md", type: "blob" as const, sha: "abc", url: "" }, + optionsWithInclude, + ), + ).toBe(true); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "src/index.js", type: "blob" as const, sha: "def", url: "" }, + optionsWithInclude, + ), + ).toBe(true); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "package.json", type: "blob" as const, sha: "ghi", url: "" }, + optionsWithInclude, + ), + ).toBe(false); + }); - await expect(strategy.scrape(options, progressCallback)).rejects.toThrow( - "URL must be a base GitHub repository URL", - ); + it("should respect exclude patterns", () => { + const optionsWithExclude = { + ...options, + excludePatterns: ["**/*.test.js", "node_modules/**"], + }; + + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "src/index.js", type: "blob" as const, sha: "abc", url: "" }, + optionsWithExclude, + ), + ).toBe(true); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "src/index.test.js", type: "blob" as const, sha: "def", url: "" }, + optionsWithExclude, + ), + ).toBe(false); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { + path: "node_modules/package/index.js", + type: "blob" as const, + sha: "ghi", + url: "", + }, + optionsWithExclude, + ), + ).toBe(false); + }); + }); + + describe("isWithinSubPath", () => { + it("should return true when no subPath is specified", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("any/path", undefined)).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("any/path", "")).toBe(true); + }); + + it("should return true for exact subPath match", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs", "docs")).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("src/lib", "src/lib")).toBe(true); + }); + + it("should return true for paths within subPath", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs/guide.md", "docs")).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("src/lib/index.js", "src/lib")).toBe(true); + }); + + it("should return false for paths outside subPath", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("README.md", "docs")).toBe(false); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("src/index.js", "docs")).toBe(false); + }); + + it("should handle trailing slashes correctly", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs/guide.md", "docs/")).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs/guide.md", "/docs")).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs/guide.md", "/docs/")).toBe(true); }); }); - describe("cleanup", () => { - it("should cleanup both underlying strategies", async () => { - await strategy.cleanup(); + describe("processItem", () => { + const options: ScraperOptions = { + url: 
"https://github.com/owner/repo", + library: "test-lib", + version: "1.0.0", + }; + + beforeEach(() => { + // Mock default branch fetch + httpFetcherInstance.fetch.mockImplementation((url: string) => { + if (url.includes("api.github.com/repos/") && !url.includes("/git/trees/")) { + return Promise.resolve({ + content: JSON.stringify({ default_branch: "main" }), + mimeType: "application/json", + source: url, + charset: "utf-8", + status: FetchStatus.SUCCESS, + }); + } + if (url.includes("/git/trees/")) { + return Promise.resolve({ + content: JSON.stringify({ + sha: "tree123", + url: "https://api.github.com/repos/owner/repo/git/trees/tree123", + tree: [ + { + path: "README.md", + type: "blob", + sha: "abc123", + size: 1024, + url: "...", + }, + { + path: "src/index.js", + type: "blob", + sha: "def456", + size: 512, + url: "...", + }, + { + path: "image.png", + type: "blob", + sha: "ghi789", + size: 2048, + url: "...", + }, + ], + truncated: false, + }), + mimeType: "application/json", + source: url, + charset: "utf-8", + status: FetchStatus.SUCCESS, + }); + } + return Promise.resolve({ + content: "file content", + mimeType: "text/plain", + source: url, + charset: "utf-8", + status: FetchStatus.SUCCESS, + }); + }); + }); + + it("should discover files and return HTTPS blob URLs", async () => { + const item = { url: "https://github.com/owner/repo", depth: 0 }; + const result = await strategy.processItem(item, options); + + expect(result.status).toBe(FetchStatus.SUCCESS); + expect(result.links).toContain("https://github.com/owner/repo/blob/main/README.md"); + expect(result.links).toContain( + "https://github.com/owner/repo/blob/main/src/index.js", + ); + expect(result.links).not.toContain( + "https://github.com/owner/repo/blob/main/image.png", + ); + }); + + it("should return empty links for non-depth-0 items", async () => { + const item = { url: "https://github.com/owner/repo", depth: 1 }; + const result = await strategy.processItem(item, options); + + expect(result.status).toBe(FetchStatus.SUCCESS); + expect(result.links).toEqual([]); + }); + + it("should handle single blob file URLs with strict scoping", async () => { + const blobOptions = { + ...options, + url: "https://github.com/owner/repo/blob/main/README.md", + }; + const item = { url: "https://github.com/owner/repo/blob/main/README.md", depth: 0 }; + const result = await strategy.processItem(item, blobOptions); - expect(repoStrategyInstance.cleanup).toHaveBeenCalled(); - expect(wikiStrategyInstance.cleanup).toHaveBeenCalled(); + expect(result.status).toBe(FetchStatus.SUCCESS); + // Strict scoping: blob URL should index ONLY that file, not discover wiki + expect(result.links).toEqual(["https://github.com/owner/repo/blob/main/README.md"]); }); }); }); diff --git a/src/scraper/strategies/GitHubScraperStrategy.ts b/src/scraper/strategies/GitHubScraperStrategy.ts index cfc1bea0..42a7438e 100644 --- a/src/scraper/strategies/GitHubScraperStrategy.ts +++ b/src/scraper/strategies/GitHubScraperStrategy.ts @@ -1,15 +1,26 @@ import type { ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; -import type { ScraperOptions, ScraperProgressEvent, ScraperStrategy } from "../types"; -import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; -import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; +import { HttpFetcher } from "../fetcher"; +import { FetchStatus } from "../fetcher/types"; +import type { QueueItem, ScraperOptions, ScraperProgressEvent } from "../types"; +import { shouldIncludeUrl 
} from "../utils/patternMatcher"; +import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; +import type { + GitHubRepoInfo, + GitHubTreeItem, + GitHubTreeResponse, +} from "./GitHubRepoProcessor"; +import { GitHubRepoProcessor } from "./GitHubRepoProcessor"; +import { GitHubWikiProcessor } from "./GitHubWikiProcessor"; /** - * GitHubScraperStrategy is a composite strategy that orchestrates the scraping of both + * GitHubScraperStrategy is a discovery strategy that orchestrates the scraping of both * GitHub repository code and wiki pages. When given a GitHub repository URL, it will: * - * 1. Attempt to scrape the repository's wiki pages using GitHubWikiScraperStrategy (prioritized) - * 2. Scrape the repository's code files using GitHubRepoScraperStrategy (with remaining page budget) + * 1. Attempt to scrape the repository's wiki pages using GitHubWikiProcessor (prioritized) + * 2. Discover all repository files using the GitHub Tree API + * 3. Create HTTPS blob URLs for each file, which are stored in the database + * 4. Process blob URLs directly with GitHubRepoProcessor * * This provides comprehensive documentation coverage by including both wiki documentation * and source code in a single scraping job, with wikis prioritized as they typically @@ -17,131 +28,416 @@ import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; * * Features: * - Handles base GitHub repository URLs (e.g., https://github.com/owner/repo) + * - Handles branch-specific URLs (e.g., https://github.com/owner/repo/tree/branch) + * - Handles single file URLs (e.g., https://github.com/owner/repo/blob/branch/path) + * - Discovers all files efficiently using GitHub's Tree API + * - Generates and processes user-friendly HTTPS blob URLs throughout * - Prioritizes wiki content over repository files for better documentation quality * - Respects maxPages limit across both scraping phases to prevent exceeding quotas * - Automatically discovers and scrapes both wiki and code content - * - Merges progress reporting from both sub-strategies * - Graceful handling when wikis don't exist or are inaccessible - * - Maintains all the capabilities of both underlying strategies */ -export class GitHubScraperStrategy implements ScraperStrategy { - private readonly repoStrategy = new GitHubRepoScraperStrategy(); - private readonly wikiStrategy = new GitHubWikiScraperStrategy(); +export class GitHubScraperStrategy extends BaseScraperStrategy { + private readonly httpFetcher = new HttpFetcher(); + private readonly wikiProcessor = new GitHubWikiProcessor(); + private readonly repoProcessor = new GitHubRepoProcessor(); canHandle(url: string): boolean { try { const parsedUrl = new URL(url); const { hostname, pathname } = parsedUrl; - // Only handle base GitHub repository URLs, not specific paths like /wiki/, /blob/, /tree/ + // Handle GitHub repository URLs if (!["github.com", "www.github.com"].includes(hostname)) { return false; } - // Check if it's a base repository URL (owner/repo format) - const pathMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/); - return pathMatch !== null; + // Handle base repository URLs (owner/repo) + const baseMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/); + if (baseMatch) { + return true; + } + + // Handle tree URLs (owner/repo/tree/branch/...) + const treeMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/tree\//); + if (treeMatch) { + return true; + } + + // Handle blob URLs (owner/repo/blob/branch/...) 
+ const blobMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/blob\//); + if (blobMatch) { + return true; + } + + return false; } catch { return false; } } - async scrape( - options: ScraperOptions, - progressCallback: ProgressCallback, + /** + * Parses a GitHub URL to extract repository information. + */ + private parseGitHubUrl( + url: string, + ): GitHubRepoInfo & { isBlob?: boolean; filePath?: string } { + const parsedUrl = new URL(url); + // Extract // from github.com///... + const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)/); + if (!match) { + throw new Error(`Invalid GitHub repository URL: ${url}`); + } + + const [, owner, repo] = match; + + // Extract branch and optional subpath from URLs like /tree// + const segments = parsedUrl.pathname.split("/").filter(Boolean); + + // Handle /blob/ URLs for single file indexing + if (segments.length >= 4 && segments[2] === "blob") { + const branch = segments[3]; + const filePath = segments.length > 4 ? segments.slice(4).join("/") : undefined; + return { owner, repo, branch, filePath, isBlob: true }; + } + + // Handle /tree/ URLs with branch and optional subpath + if (segments.length >= 4 && segments[2] === "tree") { + const branch = segments[3]; + const subPath = segments.length > 4 ? segments.slice(4).join("/") : undefined; + return { owner, repo, branch, subPath }; + } + + // Base repository URL + return { owner, repo }; + } + + /** + * Fetches the repository tree structure from GitHub API. + */ + private async fetchRepositoryTree( + repoInfo: GitHubRepoInfo, signal?: AbortSignal, - ): Promise { - // Validate it's a GitHub URL - const url = new URL(options.url); - if (!url.hostname.includes("github.com")) { - throw new Error("URL must be a GitHub URL"); + ): Promise<{ tree: GitHubTreeResponse; resolvedBranch: string }> { + const { owner, repo, branch } = repoInfo; + + // If no branch specified, fetch the default branch first + let targetBranch = branch; + if (!targetBranch) { + try { + const repoUrl = `https://api.github.com/repos/${owner}/${repo}`; + logger.debug(`Fetching repository info: ${repoUrl}`); + + const repoContent = await this.httpFetcher.fetch(repoUrl, { signal }); + const content = + typeof repoContent.content === "string" + ? repoContent.content + : repoContent.content.toString("utf-8"); + const repoData = JSON.parse(content) as { default_branch: string }; + targetBranch = repoData.default_branch; + + logger.debug(`Using default branch: ${targetBranch}`); + } catch (error) { + logger.warn(`⚠️ Could not fetch default branch, using 'main': ${error}`); + targetBranch = "main"; + } } - // Parse the repository information - const pathMatch = url.pathname.match(/^\/([^/]+)\/([^/]+)\/?$/); - if (!pathMatch) { - throw new Error("URL must be a base GitHub repository URL"); + const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`; + logger.debug(`Fetching repository tree: ${treeUrl}`); + + const rawContent = await this.httpFetcher.fetch(treeUrl, { signal }); + const content = + typeof rawContent.content === "string" + ? rawContent.content + : rawContent.content.toString("utf-8"); + const treeData = JSON.parse(content) as GitHubTreeResponse; + + if (treeData.truncated) { + logger.warn( + `⚠️ Repository tree was truncated for ${owner}/${repo}. 
Some files may be missing.`, + ); } - const [, owner, repo] = pathMatch; - logger.info(`🚀 Starting comprehensive GitHub scraping for ${owner}/${repo}`); - - // We'll track progress from both strategies and merge them - let totalPagesDiscovered = 0; - let wikiPagesScraped = 0; - let wikiCompleted = false; - let repoCompleted = false; - - const mergedProgressCallback: ProgressCallback = async ( - progress, - ) => { - // For the first strategy (wiki), accumulate discovered pages and scraped count - if (!wikiCompleted) { - totalPagesDiscovered = progress.totalDiscovered; - wikiPagesScraped = progress.pagesScraped; - } else if (!repoCompleted) { - // For the second strategy (repo), create cumulative progress - progress = { - ...progress, - pagesScraped: wikiPagesScraped + progress.pagesScraped, - totalPages: wikiPagesScraped + progress.totalPages, - totalDiscovered: totalPagesDiscovered + progress.totalDiscovered, - }; + return { tree: treeData, resolvedBranch: targetBranch }; + } + + /** + * Determines if a file should be processed based on its path and type. + */ + private shouldProcessFile(item: GitHubTreeItem, options: ScraperOptions): boolean { + if (item.type !== "blob") { + return false; + } + + const path = item.path; + + // Whitelist of text-based file extensions + const textExtensions = [ + ".md", + ".mdx", + ".txt", + ".rst", + ".adoc", + ".asciidoc", + ".html", + ".htm", + ".xml", + ".css", + ".scss", + ".sass", + ".less", + ".js", + ".jsx", + ".ts", + ".tsx", + ".py", + ".java", + ".c", + ".cpp", + ".cc", + ".cxx", + ".h", + ".hpp", + ".cs", + ".go", + ".rs", + ".rb", + ".php", + ".swift", + ".kt", + ".scala", + ".clj", + ".cljs", + ".hs", + ".elm", + ".dart", + ".r", + ".m", + ".mm", + ".sh", + ".bash", + ".zsh", + ".fish", + ".ps1", + ".bat", + ".cmd", + ".json", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".properties", + ".env", + ".gitignore", + ".dockerignore", + ".gitattributes", + ".editorconfig", + ".gradle", + ".pom", + ".sbt", + ".maven", + ".cmake", + ".make", + ".dockerfile", + ".mod", + ".sum", + ".sql", + ".graphql", + ".gql", + ".proto", + ".thrift", + ".avro", + ".csv", + ".tsv", + ".log", + ]; + + const pathLower = path.toLowerCase(); + const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext)); + const hasCompoundExtension = + pathLower.includes(".env.") || + pathLower.endsWith(".env") || + pathLower.includes(".config.") || + pathLower.includes(".lock"); + + const fileName = path.split("/").pop() || ""; + const fileNameLower = fileName.toLowerCase(); + const commonTextFiles = [ + "readme", + "license", + "changelog", + "contributing", + "authors", + "maintainers", + "dockerfile", + "makefile", + "rakefile", + "gemfile", + "podfile", + "cartfile", + "brewfile", + "procfile", + "vagrantfile", + "gulpfile", + "gruntfile", + ".prettierrc", + ".eslintrc", + ".babelrc", + ".nvmrc", + ".npmrc", + ]; + + const isCommonTextFile = commonTextFiles.some((name) => { + if (name.startsWith(".")) { + return fileNameLower === name || fileNameLower.startsWith(`${name}.`); } + return fileNameLower === name || fileNameLower.startsWith(`${name}.`); + }); - // Report the progress as-is and await completion - await progressCallback(progress); - }; + if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) { + return false; + } - try { - // First, attempt to scrape the wiki (prioritized for better documentation) - const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`; - const wikiOptions = { ...options, url: wikiUrl }; + return 
shouldIncludeUrl(path, options.includePatterns, options.excludePatterns); + } - logger.info(`📖 Attempting to scrape wiki for ${owner}/${repo}`); + /** + * Checks if a path is within the specified subpath. + */ + private isWithinSubPath(path: string, subPath?: string): boolean { + if (!subPath) { + return true; + } - try { - // Check if the wiki exists by trying to access it - await this.wikiStrategy.scrape(wikiOptions, mergedProgressCallback, signal); - wikiCompleted = true; - logger.info( - `✅ Completed wiki scraping for ${owner}/${repo} (${wikiPagesScraped} pages)`, - ); - } catch (error) { - wikiCompleted = true; - logger.info(`ℹ️ Wiki not available or accessible for ${owner}/${repo}: ${error}`); - // Don't throw - wiki not existing is not a failure condition + const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, ""); + if (trimmedSubPath.length === 0) { + return true; + } + + const normalizedPath = path.replace(/^\/+/, "").replace(/\/+$/, ""); + if (normalizedPath === trimmedSubPath) { + return true; + } + + return normalizedPath.startsWith(`${trimmedSubPath}/`); + } + + async processItem( + item: QueueItem, + options: ScraperOptions, + signal?: AbortSignal, + ): Promise { + // Delegate to wiki processor for wiki URLs + // Use precise pattern matching: /owner/repo/wiki or /owner/repo/wiki/ + try { + const parsedUrl = new URL(item.url); + if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) { + return await this.wikiProcessor.process(item, options, signal); } + } catch { + // If URL parsing fails, fall through to other handlers + } + + // For the main repository URL (depth 0), perform discovery + // This includes blob URLs at depth 0, which should return themselves as discovered links + if (item.depth === 0) { + const repoInfo = this.parseGitHubUrl(options.url); + const { owner, repo } = repoInfo; - // Then, scrape the repository code with adjusted page limit - const maxPages = options.maxPages || 1000; - const remainingPages = Math.max(0, maxPages - wikiPagesScraped); + logger.debug(`Discovering GitHub repository ${owner}/${repo}`); - if (remainingPages > 0) { - logger.info( - `📂 Scraping repository code for ${owner}/${repo} (${remainingPages} pages remaining)`, + const discoveredLinks: string[] = []; + + // Handle single file (blob) URLs - strict scoping: index ONLY the file + if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) { + const { branch = "main", filePath } = repoInfo; + logger.debug( + `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`, ); - const repoOptions = { ...options, maxPages: remainingPages }; - await this.repoStrategy.scrape(repoOptions, mergedProgressCallback, signal); - repoCompleted = true; - logger.info(`✅ Completed repository code scraping for ${owner}/${repo}`); - } else { - logger.info( - `ℹ️ Skipping repository code scraping - page limit reached with wiki content`, + + // Generate HTTPS blob URL for storage + discoveredLinks.push( + `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`, ); + + return { + url: item.url, + links: discoveredLinks, + status: FetchStatus.SUCCESS, + }; } - logger.info(`🎉 Comprehensive GitHub scraping completed for ${owner}/${repo}`); + // Discover wiki URL for full repo scrapes (will be processed by GitHubWikiScraperStrategy) + const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`; + discoveredLinks.push(wikiUrl); + logger.debug(`Discovered wiki URL: ${wikiUrl}`); + + // 3. 
Discover all files in the repository + const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal); + + const fileItems = tree.tree + .filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)) + .filter((treeItem) => this.shouldProcessFile(treeItem, options)); + + logger.debug( + `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`, + ); + + // Create HTTPS blob URLs for storage in database + // These are user-friendly, clickable URLs that work outside the system + const fileUrls = fileItems.map( + (treeItem) => + `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`, + ); + + discoveredLinks.push(...fileUrls); + + logger.debug( + `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`, + ); + + return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS }; + } + + // Handle HTTPS blob URLs at depth > 0 (from database during refresh or discovered files) + // Process blob URLs directly - fetch content and return empty links + // Use precise pattern matching: /owner/repo/blob/branch/path + try { + const parsedUrl = new URL(item.url); + if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) { + logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`); + return await this.repoProcessor.process(item, options, signal); + } } catch (error) { - logger.error(`❌ GitHub scraping failed for ${owner}/${repo}: ${error}`); - throw error; + logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`); + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; } + + // For any other URLs at non-zero depth, return empty (shouldn't happen in practice) + logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`); + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; + } + + async scrape( + options: ScraperOptions, + progressCallback: ProgressCallback, + signal?: AbortSignal, + ): Promise { + const url = new URL(options.url); + if (!url.hostname.includes("github.com")) { + throw new Error("URL must be a GitHub URL"); + } + + // Use the base class implementation which handles initialQueue properly + // The processItem method will discover all wiki and repo file URLs + // The base scraper will automatically deduplicate URLs from initialQueue + await super.scrape(options, progressCallback, signal); } - /** - * Cleanup resources used by both underlying strategies. 
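(To make the discovery step concrete, a hedged sketch of the link list the depth-0 processItem above would return for a hypothetical repository with two processable files; the file names are invented, and the URL shapes follow the patch.)

```ts
// Hypothetical discovery result for https://github.com/owner/repo at depth 0
// (file names invented; URL shapes follow the strategy above).
const discovered = {
  url: "https://github.com/owner/repo",
  links: [
    // The wiki root is emitted first and later handled by the wiki processor.
    "https://github.com/owner/repo/wiki",
    // Each processable file becomes a clickable blob URL on the resolved branch.
    "https://github.com/owner/repo/blob/main/README.md",
    "https://github.com/owner/repo/blob/main/src/index.ts",
  ],
  status: "SUCCESS", // stands in for FetchStatus.SUCCESS
};
console.log(discovered.links.length); // 3
```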
- */ async cleanup(): Promise { - await Promise.allSettled([this.repoStrategy.cleanup(), this.wikiStrategy.cleanup()]); + await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]); } } diff --git a/src/scraper/strategies/GitHubWikiScraperStrategy.ts b/src/scraper/strategies/GitHubWikiProcessor.ts similarity index 68% rename from src/scraper/strategies/GitHubWikiScraperStrategy.ts rename to src/scraper/strategies/GitHubWikiProcessor.ts index 2c62ccab..de0cb6e4 100644 --- a/src/scraper/strategies/GitHubWikiScraperStrategy.ts +++ b/src/scraper/strategies/GitHubWikiProcessor.ts @@ -1,13 +1,12 @@ -import type { ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import { HttpFetcher } from "../fetcher"; import { FetchStatus } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; import type { ContentPipeline, PipelineResult } from "../pipelines/types"; import type { QueueItem } from "../types"; -import { ScrapeMode, type ScraperOptions, type ScraperProgressEvent } from "../types"; +import { ScrapeMode, type ScraperOptions } from "../types"; import { shouldIncludeUrl } from "../utils/patternMatcher"; -import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; +import type { ProcessItemResult } from "./BaseScraperStrategy"; interface GitHubWikiInfo { owner: string; @@ -15,7 +14,7 @@ interface GitHubWikiInfo { } /** - * GitHubWikiScraperStrategy handles scraping GitHub wiki pages using standard web scraping techniques. + * GitHubWikiProcessor handles scraping GitHub wiki pages using standard web scraping techniques. * GitHub wikis are separate from the main repository and are hosted at /wiki/ URLs. * * Features: @@ -24,34 +23,16 @@ interface GitHubWikiInfo { * - Processes wiki content as HTML/Markdown pages * - Stays within the wiki scope to avoid crawling the entire repository * - * Note: This strategy is specifically for /wiki/ URLs and does not handle regular repository files. + * This processor is stateless and contains the core logic from GitHubWikiScraperStrategy. */ -export class GitHubWikiScraperStrategy extends BaseScraperStrategy { +export class GitHubWikiProcessor { private readonly httpFetcher = new HttpFetcher(); private readonly pipelines: ContentPipeline[]; constructor() { - super(); this.pipelines = PipelineFactory.createStandardPipelines(); } - canHandle(url: string): boolean { - try { - const parsedUrl = new URL(url); - const { hostname, pathname } = parsedUrl; - - // Check if it's a GitHub URL and contains /wiki/ - // This should handle specific wiki URLs like /owner/repo/wiki/PageName - return ( - ["github.com", "www.github.com"].includes(hostname) && - pathname.includes("/wiki") && - pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/) !== null - ); - } catch { - return false; - } - } - /** * Parses a GitHub wiki URL to extract repository information. */ @@ -68,15 +49,17 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { } /** - * Override shouldProcessUrl to only process URLs within the wiki scope. + * Determines if a URL should be processed within the wiki scope. 
*/ - protected shouldProcessUrl(url: string, options: ScraperOptions): boolean { + shouldProcessUrl(url: string, options: ScraperOptions): boolean { try { const parsedUrl = new URL(url); - const wikiInfo = this.parseGitHubWikiUrl(options.url); - const expectedWikiPath = `/${wikiInfo.owner}/${wikiInfo.repo}/wiki`; - // Only process URLs that are within the same wiki + // Get the expected repository info from the base URL + const baseWikiInfo = this.parseGitHubWikiUrl(options.url); + const expectedWikiPath = `/${baseWikiInfo.owner}/${baseWikiInfo.repo}/wiki`; + + // Check if the URL is within the same wiki if (!parsedUrl.pathname.startsWith(expectedWikiPath)) { return false; } @@ -95,20 +78,27 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { } } - async processItem( + /** + * Processes a single GitHub wiki page. + */ + async process( item: QueueItem, options: ScraperOptions, signal?: AbortSignal, ): Promise { const currentUrl = item.url; - logger.info( - `📖 Processing wiki page ${this.pageCount}/${options.maxPages}: ${currentUrl}`, - ); - try { - // Fetch the wiki page content - const rawContent = await this.httpFetcher.fetch(currentUrl, { signal }); + // Fetch the wiki page content with ETag for conditional requests + const rawContent = await this.httpFetcher.fetch(currentUrl, { + signal, + etag: item.etag, + }); + + // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND + if (rawContent.status !== FetchStatus.SUCCESS) { + return { url: currentUrl, links: [], status: rawContent.status }; + } // Process content through appropriate pipeline let processed: PipelineResult | undefined; @@ -201,34 +191,8 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { } } - async scrape( - options: ScraperOptions, - progressCallback: ProgressCallback, - signal?: AbortSignal, - ): Promise { - // Validate it's a GitHub wiki URL - const url = new URL(options.url); - if (!url.hostname.includes("github.com") || !url.pathname.includes("/wiki")) { - throw new Error("URL must be a GitHub wiki URL"); - } - - // Ensure the starting URL points to the wiki home if no specific page is provided - let startUrl = options.url; - if (url.pathname.endsWith("/wiki") || url.pathname.endsWith("/wiki/")) { - // If the URL just points to /wiki/, start from the Home page - startUrl = url.pathname.endsWith("/") - ? `${options.url}Home` - : `${options.url}/Home`; - } - - // Update options with the corrected start URL - const wikiOptions = { ...options, url: startUrl }; - - return super.scrape(wikiOptions, progressCallback, signal); - } - /** - * Cleanup resources used by this strategy. + * Cleanup resources used by this processor. 
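(The conditional fetch used in process() above boils down to standard HTTP revalidation with If-None-Match. Below is a minimal sketch using the global fetch API; the status strings only loosely mirror FetchStatus, and the helper itself is an illustration, not the fetcher's actual implementation.)

```ts
// Sketch: revalidate a previously indexed page using its stored ETag (Node 18+ fetch).
async function conditionalFetch(url: string, cachedEtag?: string) {
  const res = await fetch(url, {
    headers: cachedEtag ? { "If-None-Match": cachedEtag } : {},
  });

  if (res.status === 304) {
    // The cached copy is still current; the caller can skip re-processing.
    return { status: "NOT_MODIFIED" as const };
  }
  if (res.status === 404) {
    return { status: "NOT_FOUND" as const };
  }
  return {
    status: "SUCCESS" as const,
    etag: res.headers.get("etag") ?? undefined,
    lastModified: res.headers.get("last-modified") ?? undefined,
    content: await res.text(),
  };
}
```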
*/ async cleanup(): Promise { await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close())); diff --git a/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts b/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts deleted file mode 100644 index c2ebb366..00000000 --- a/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts +++ /dev/null @@ -1,698 +0,0 @@ -import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { FetchStatus, HttpFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; -import { HtmlPipeline } from "../pipelines/HtmlPipeline"; -import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; -import type { PipelineResult } from "../pipelines/types"; -import { ScrapeMode, type ScraperOptions } from "../types"; -import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; - -// Mock the fetcher and pipelines -vi.mock("../fetcher"); -vi.mock("../pipelines/HtmlPipeline"); -vi.mock("../pipelines/MarkdownPipeline"); - -const mockHttpFetcher = vi.mocked(HttpFetcher); -const mockHtmlPipeline = vi.mocked(HtmlPipeline); -const mockMarkdownPipeline = vi.mocked(MarkdownPipeline); - -describe("GitHubWikiScraperStrategy", () => { - let strategy: GitHubWikiScraperStrategy; - let httpFetcherInstance: any; - let htmlPipelineInstance: any; - let markdownPipelineInstance: any; - - beforeEach(() => { - // Reset all mocks - vi.clearAllMocks(); - - // Setup fetcher mock - httpFetcherInstance = { - fetch: vi.fn(), - }; - mockHttpFetcher.mockImplementation(() => httpFetcherInstance); - - // Setup pipeline mocks - htmlPipelineInstance = { - canProcess: vi.fn(), - process: vi.fn(), - }; - markdownPipelineInstance = { - canProcess: vi.fn(), - process: vi.fn(), - }; - mockHtmlPipeline.mockImplementation(() => htmlPipelineInstance); - mockMarkdownPipeline.mockImplementation(() => markdownPipelineInstance); - - strategy = new GitHubWikiScraperStrategy(); - }); - - afterEach(() => { - vi.restoreAllMocks(); - }); - - describe("canHandle", () => { - it("should handle GitHub wiki URLs", () => { - expect(strategy.canHandle("https://github.com/owner/repo/wiki")).toBe(true); - expect(strategy.canHandle("https://github.com/owner/repo/wiki/")).toBe(true); - expect(strategy.canHandle("https://github.com/owner/repo/wiki/Home")).toBe(true); - expect( - strategy.canHandle("https://github.com/owner/repo/wiki/Getting-Started"), - ).toBe(true); - expect(strategy.canHandle("https://www.github.com/owner/repo/wiki/API")).toBe(true); - }); - - it("should not handle non-wiki GitHub URLs", () => { - expect(strategy.canHandle("https://github.com/owner/repo")).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(false); - expect( - strategy.canHandle("https://github.com/owner/repo/blob/main/README.md"), - ).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/issues")).toBe(false); - }); - - it("should not handle non-GitHub URLs", () => { - expect(strategy.canHandle("https://example.com/wiki")).toBe(false); - expect(strategy.canHandle("https://gitlab.com/owner/repo/wiki")).toBe(false); - expect(strategy.canHandle("https://bitbucket.org/owner/repo/wiki")).toBe(false); - }); - - it("should handle malformed URLs gracefully", () => { - expect(strategy.canHandle("invalid-url")).toBe(false); - expect(strategy.canHandle("")).toBe(false); - expect(strategy.canHandle("not-a-url-at-all")).toBe(false); - }); - }); - - describe("parseGitHubWikiUrl", () => { - it("should parse basic wiki URL", () 
=> { - const result = strategy.parseGitHubWikiUrl("https://github.com/owner/repo/wiki"); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should parse wiki URL with trailing slash", () => { - const result = strategy.parseGitHubWikiUrl("https://github.com/owner/repo/wiki/"); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should parse wiki URL with specific page", () => { - const result = strategy.parseGitHubWikiUrl( - "https://github.com/owner/repo/wiki/Home", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should parse wiki URL with complex page name", () => { - const result = strategy.parseGitHubWikiUrl( - "https://github.com/owner/repo/wiki/Getting-Started-Guide", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should handle www subdomain", () => { - const result = strategy.parseGitHubWikiUrl( - "https://www.github.com/owner/repo/wiki", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should throw error for invalid wiki URL", () => { - expect(() => { - strategy.parseGitHubWikiUrl("https://github.com/invalid"); - }).toThrow("Invalid GitHub wiki URL"); - - expect(() => { - strategy.parseGitHubWikiUrl("https://github.com/owner/repo"); - }).toThrow("Invalid GitHub wiki URL"); - }); - }); - - describe("shouldProcessUrl", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki", - library: "test-lib", - version: "1.0.0", - }; - - it("should process URLs within the same wiki", () => { - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl("https://github.com/owner/repo/wiki/Home", options), - ).toBe(true); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl("https://github.com/owner/repo/wiki/API", options), - ).toBe(true); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl( - "https://github.com/owner/repo/wiki/Getting-Started", - options, - ), - ).toBe(true); - }); - - it("should not process URLs outside the wiki", () => { - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl("https://github.com/owner/repo", options), - ).toBe(false); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl("https://github.com/owner/repo/tree/main", options), - ).toBe(false); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl("https://github.com/other/repo/wiki/Home", options), - ).toBe(false); - }); - - it("should respect include patterns", () => { - const optionsWithInclude = { - ...options, - includePatterns: ["API*", "Getting*"], - }; - - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl( - "https://github.com/owner/repo/wiki/API-Reference", - optionsWithInclude, - ), - ).toBe(true); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl( - "https://github.com/owner/repo/wiki/Getting-Started", - optionsWithInclude, - ), - ).toBe(true); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl( - "https://github.com/owner/repo/wiki/Home", - optionsWithInclude, - ), - ).toBe(false); - }); - - it("should respect exclude patterns", () => { - const optionsWithExclude = { - ...options, - excludePatterns: ["*deprecated*", "old-*"], - }; - - expect( - // @ts-expect-error - testing internal method - 
strategy.shouldProcessUrl( - "https://github.com/owner/repo/wiki/deprecated-api", - optionsWithExclude, - ), - ).toBe(false); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl( - "https://github.com/owner/repo/wiki/old-guide", - optionsWithExclude, - ), - ).toBe(false); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl( - "https://github.com/owner/repo/wiki/current-guide", - optionsWithExclude, - ), - ).toBe(true); - }); - - it("should handle Home page as default", () => { - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl("https://github.com/owner/repo/wiki", options), - ).toBe(true); - expect( - // @ts-expect-error - testing internal method - strategy.shouldProcessUrl("https://github.com/owner/repo/wiki/", options), - ).toBe(true); - }); - - it("should handle malformed URLs gracefully", () => { - // @ts-expect-error - testing internal method - expect(strategy.shouldProcessUrl("invalid-url", options)).toBe(false); - // @ts-expect-error - testing internal method - expect(strategy.shouldProcessUrl("", options)).toBe(false); - }); - }); - - describe("processItem", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki", - library: "test-lib", - version: "1.0.0", - }; - - it("should process wiki page and return document with links", async () => { - const rawContent: RawContent = { - content: ` - - - Wiki Home - -

-              <h1>Welcome to the Wiki</h1>
-              <p>This is the home page of our documentation.</p>

- - - - `, - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Home", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: - "Wiki Home\n\nWelcome to the Wiki\n\nThis is the home page of our documentation.", - title: "Wiki Home", - chunks: [], - errors: [], - links: [ - "/owner/repo/wiki/API", - "/owner/repo/wiki/Getting-Started", - "https://external.com", - ], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.content).toEqual({ - textContent: - "Wiki Home\n\nWelcome to the Wiki\n\nThis is the home page of our documentation.", - title: "Wiki Home", - chunks: expect.any(Array), - links: expect.any(Array), - errors: expect.any(Array), - }); - - // Should only include wiki links, not external links - expect(result.links).toEqual([ - "https://github.com/owner/repo/wiki/API", - "https://github.com/owner/repo/wiki/Getting-Started", - ]); - }); - - it("should use page name as title fallback when no title found", async () => { - const rawContent: RawContent = { - content: "

<html><body>Content without title</body></html>

", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Getting-Started", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: "Content without title", - chunks: [], - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { - url: "https://github.com/owner/repo/wiki/Getting-Started", - depth: 1, - }; - const result = await strategy.processItem(item, options); - - expect(result.title).toBe("Getting-Started"); - }); - - it("should handle Home page title fallback", async () => { - const rawContent: RawContent = { - content: "

<html><body>Home page content</body></html>

", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: "Home page content", - chunks: [], - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.title).toBe("Home"); - }); - - it("should force ScrapeMode.Fetch for consistent behavior", async () => { - const rawContent: RawContent = { - content: "

<html><body>Test</body></html>

", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Test", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: "Test", - title: "Test", - chunks: [], - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockImplementation( - async (_content: any, opts: any) => { - expect(opts.scrapeMode).toBe("fetch"); - return processedContent; - }, - ); - - const optionsWithPlaywright = { - ...options, - scrapeMode: ScrapeMode.Playwright, - }; - - const item = { url: "https://github.com/owner/repo/wiki/Test", depth: 1 }; - await strategy.processItem(item, optionsWithPlaywright); - - expect(htmlPipelineInstance.process).toHaveBeenCalledWith( - rawContent, - expect.objectContaining({ scrapeMode: "fetch" }), - expect.any(Object), - ); - }); - - it("should handle unsupported content types", async () => { - const rawContent: RawContent = { - content: "binary content", - mimeType: "application/octet-stream", - source: "https://github.com/owner/repo/wiki/Binary", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(false); - markdownPipelineInstance.canProcess.mockReturnValue(false); - - const item = { url: "https://github.com/owner/repo/wiki/Binary", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.content).toBeUndefined(); - expect(result.links).toEqual([]); - }); - - it("should handle fetch errors gracefully", async () => { - httpFetcherInstance.fetch.mockRejectedValue(new Error("Network error")); - - const item = { url: "https://github.com/owner/repo/wiki/Unreachable", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.content).toBeUndefined(); - expect(result.links).toEqual([]); - }); - - it("should handle processing errors from pipelines", async () => { - const rawContent: RawContent = { - content: "

<html><body>Test</body></html>

", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Test", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContentWithErrors = { - textContent: "Test", - metadata: { title: "Test" }, - chunks: [], - errors: [new Error("Processing warning")], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContentWithErrors); - - const item = { url: "https://github.com/owner/repo/wiki/Test", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.content).toBeDefined(); - expect(result.content?.textContent).toBe("Test"); - }); - }); - - describe("scrape", () => { - it("should validate GitHub wiki URL", async () => { - const invalidOptions: ScraperOptions = { - url: "https://example.com/wiki", - library: "test-lib", - version: "1.0.0", - }; - - await expect(strategy.scrape(invalidOptions, vi.fn())).rejects.toThrow( - "URL must be a GitHub wiki URL", - ); - }); - - it("should validate GitHub URL without wiki path", async () => { - const invalidOptions: ScraperOptions = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; - - await expect(strategy.scrape(invalidOptions, vi.fn())).rejects.toThrow( - "URL must be a GitHub wiki URL", - ); - }); - - it("should append /Home to bare wiki URLs", async () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki", - library: "test-lib", - version: "1.0.0", - }; - - // Mock super.scrape to capture the options passed to it - const superScrapeSpy = vi.spyOn( - Object.getPrototypeOf(Object.getPrototypeOf(strategy)), - "scrape", - ); - superScrapeSpy.mockResolvedValue(undefined); - - await strategy.scrape(options, vi.fn()); - - expect(superScrapeSpy).toHaveBeenCalledWith( - expect.objectContaining({ - url: "https://github.com/owner/repo/wiki/Home", - }), - expect.any(Function), - undefined, - ); - - superScrapeSpy.mockRestore(); - }); - - it("should append /Home to wiki URLs with trailing slash", async () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki/", - library: "test-lib", - version: "1.0.0", - }; - - const superScrapeSpy = vi.spyOn( - Object.getPrototypeOf(Object.getPrototypeOf(strategy)), - "scrape", - ); - superScrapeSpy.mockResolvedValue(undefined); - - await strategy.scrape(options, vi.fn()); - - expect(superScrapeSpy).toHaveBeenCalledWith( - expect.objectContaining({ - url: "https://github.com/owner/repo/wiki/Home", - }), - expect.any(Function), - undefined, - ); - - superScrapeSpy.mockRestore(); - }); - - it("should not modify URLs that already point to specific pages", async () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki/Getting-Started", - library: "test-lib", - version: "1.0.0", - }; - - const superScrapeSpy = vi.spyOn( - Object.getPrototypeOf(Object.getPrototypeOf(strategy)), - "scrape", - ); - superScrapeSpy.mockResolvedValue(undefined); - - await strategy.scrape(options, vi.fn()); - - expect(superScrapeSpy).toHaveBeenCalledWith( - expect.objectContaining({ - url: "https://github.com/owner/repo/wiki/Getting-Started", - }), - expect.any(Function), - undefined, - ); - - superScrapeSpy.mockRestore(); - }); - }); - - describe("Link filtering and URL normalization", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki", - library: "test-lib", - version: "1.0.0", - }; 
- - it("should convert relative links to absolute URLs", async () => { - const rawContent: RawContent = { - content: ` - - API Docs - Getting Started - Advanced - - `, - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Home", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: "Content", - title: "Test", - chunks: [], - errors: [], - links: ["/owner/repo/wiki/API", "Getting-Started", "./Advanced-Topics"], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.links).toEqual([ - "https://github.com/owner/repo/wiki/API", - "https://github.com/owner/repo/wiki/Getting-Started", - "https://github.com/owner/repo/wiki/Advanced-Topics", - ]); - }); - - it("should filter out non-wiki links", async () => { - const rawContent: RawContent = { - content: "Content", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Home", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: "Content", - title: "Test", - chunks: [], - errors: [], - links: [ - "https://github.com/owner/repo/wiki/API", // Should include - "https://github.com/owner/repo", // Should exclude (not wiki) - "https://github.com/other/repo/wiki/Home", // Should exclude (different repo) - "https://external.com/wiki", // Should exclude (external domain) - "mailto:test@example.com", // Should exclude (different protocol) - ], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await strategy.processItem(item, options); - - expect(result.links).toEqual(["https://github.com/owner/repo/wiki/API"]); - }); - - it("should handle malformed URLs in links gracefully", async () => { - const rawContent: RawContent = { - content: "Content", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Home", - charset: "utf-8", - status: FetchStatus.SUCCESS, - }; - - const processedContent: PipelineResult = { - textContent: "Content", - title: "Test", - chunks: [], - errors: [], - links: [ - "invalid-url", - "https://github.com/owner/repo/wiki/Valid", - "", - "not-a-url-at-all", - ], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await strategy.processItem(item, options); - - // Should only include the valid wiki link - expect(result.links).toEqual(["https://github.com/owner/repo/wiki/Valid"]); - }); - }); -}); diff --git a/src/scraper/strategies/LocalFileStrategy.test.ts b/src/scraper/strategies/LocalFileStrategy.test.ts index 58d70195..dda0b5d6 100644 --- a/src/scraper/strategies/LocalFileStrategy.test.ts +++ b/src/scraper/strategies/LocalFileStrategy.test.ts @@ -585,9 +585,13 @@ describe("LocalFileStrategy", () => { const progressCallback = vi.fn>(); const testContent = "# Test File\nOriginal content"; - // Create initial file + 
// Create initial file with a specific mtime vol.fromJSON({ "/test.md": testContent }, "/"); + // Get the file stats to capture the exact mtime + const stats = await vol.promises.stat("/test.md"); + const initialMtime = stats.mtime; + // First scrape to get the initial etag const initialOptions: ScraperOptions = { url: "file:///test.md", @@ -604,7 +608,12 @@ describe("LocalFileStrategy", () => { const firstCall = progressCallback.mock.calls[0][0]; const etag = firstCall.result?.etag; - // Reset the callback + // Verify the mtime hasn't changed + const statsAfterScrape = await vol.promises.stat("/test.md"); + expect(statsAfterScrape.mtime.getTime()).toBe(initialMtime.getTime()); + + // Reset the callback but DON'T reset the filesystem + // This preserves the file's mtime, so the etag stays the same progressCallback.mockClear(); // Now do a refresh with the same etag (file unchanged) @@ -626,9 +635,18 @@ describe("LocalFileStrategy", () => { await strategy.scrape(refreshOptions, progressCallback); - // Verify no documents were processed (file unchanged) - const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); - expect(docCalls).toHaveLength(0); + // Verify file was checked but returned NOT_MODIFIED (no result with content) + // The root URL at depth 0 is always processed to check for changes + expect(progressCallback).toHaveBeenCalledTimes(1); + expect(progressCallback).toHaveBeenCalledWith( + expect.objectContaining({ + pagesScraped: 1, + currentUrl: "file:///test.md", + depth: 0, + result: null, // NOT_MODIFIED returns null result + pageId: 123, + }), + ); }); it("should re-process file when it has been modified", async () => { @@ -795,56 +813,56 @@ describe("LocalFileStrategy", () => { expect(calledUrls).toContain("file:///testdir/file2.md"); }); - it("should preserve depth from original scrape during refresh", async () => { + it("should preserve depth from original scrape during refresh for nested files", async () => { const strategy = new LocalFileStrategy(); const progressCallback = vi.fn>(); - const testContent = "# Deep File\nContent at depth 2"; vol.fromJSON( { - "/deep/file.md": testContent, + "/testdir/subdir/deep/file.md": "# Deep File\nOriginal content", }, "/", ); - // First scrape to get etag + // First scrape starting from directory - file will be discovered at depth 3 const initialOptions: ScraperOptions = { - url: "file:///deep/file.md", + url: "file:///testdir", library: "test", version: "1.0", - maxPages: 1, - maxDepth: 2, + maxPages: 10, + maxDepth: 3, }; await strategy.scrape(initialOptions, progressCallback); + expect(progressCallback).toHaveBeenCalledTimes(1); const firstCall = progressCallback.mock.calls[0][0]; + expect(firstCall.depth).toBe(3); // File discovered at depth 3 const etag = firstCall.result?.etag; // Update the file with new content vol.reset(); vol.fromJSON( { - "/deep/file.md": "# Deep File\nUpdated content", + "/testdir/subdir/deep/file.md": "# Deep File\nUpdated content", }, "/", ); - // Wait a bit to ensure different mtime await new Promise((resolve) => setTimeout(resolve, 10)); progressCallback.mockClear(); - // Refresh with original depth + // Refresh starting from same directory with file in initialQueue at depth 3 const refreshOptions: ScraperOptions = { - url: "file:///deep/file.md", + url: "file:///testdir", library: "test", version: "1.0", - maxPages: 1, - maxDepth: 2, + maxPages: 10, + maxDepth: 3, initialQueue: [ { - url: "file:///deep/file.md", - depth: 2, // Original depth preserved + url: 
"file:///testdir/subdir/deep/file.md", + depth: 3, // Original depth from discovery pageId: 555, etag: etag, }, @@ -853,10 +871,12 @@ describe("LocalFileStrategy", () => { await strategy.scrape(refreshOptions, progressCallback); - // Verify depth is preserved + // Verify file was re-processed and depth from initialQueue is preserved const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); expect(docCalls).toHaveLength(1); - expect(docCalls[0][0].depth).toBe(2); + expect(docCalls[0][0].depth).toBe(3); + expect(docCalls[0][0].pageId).toBe(555); + expect(docCalls[0][0].result?.textContent).toContain("Updated content"); }); }); }); diff --git a/src/scraper/strategies/LocalFileStrategy.ts b/src/scraper/strategies/LocalFileStrategy.ts index 723a8cef..d523a67f 100644 --- a/src/scraper/strategies/LocalFileStrategy.ts +++ b/src/scraper/strategies/LocalFileStrategy.ts @@ -67,8 +67,6 @@ export class LocalFileStrategy extends BaseScraperStrategy { return { url: item.url, links, status: FetchStatus.SUCCESS }; } - logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`); - const rawContent: RawContent = await this.fileFetcher.fetch(item.url, { etag: item.etag, }); diff --git a/src/scraper/strategies/WebScraperStrategy.test.ts b/src/scraper/strategies/WebScraperStrategy.test.ts index 8cc8cdaf..44461660 100644 --- a/src/scraper/strategies/WebScraperStrategy.test.ts +++ b/src/scraper/strategies/WebScraperStrategy.test.ts @@ -1020,23 +1020,35 @@ describe("WebScraperStrategy", () => { it("should refresh page content when page returns 200 OK", async () => { const progressCallback = vi.fn>(); + const rootContent = + "Root

<h1>Root</h1>

"; const updatedContent = "Updated

<h1>New Content</h1>

"; - // Configure mock to return 200 with new content - mockFetchFn.mockResolvedValue({ - content: updatedContent, - mimeType: "text/html", - source: "https://example.com/updated-page", - status: FetchStatus.SUCCESS, - etag: "new-etag", + // Configure mock to return different content for root vs updated page + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com") { + return { + content: rootContent, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + } + return { + content: updatedContent, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + etag: "new-etag", + }; }); // Create a queue item with pageId and etag (refresh operation) options.initialQueue = [ { url: "https://example.com/updated-page", - depth: 0, + depth: 1, pageId: 789, etag: "old-etag", }, @@ -1044,7 +1056,8 @@ describe("WebScraperStrategy", () => { await strategy.scrape(options, progressCallback); - // Verify fetch was called with old etag + // Verify fetch was called for both root and updated page + expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything()); expect(mockFetchFn).toHaveBeenCalledWith( "https://example.com/updated-page", expect.objectContaining({ @@ -1052,16 +1065,24 @@ describe("WebScraperStrategy", () => { }), ); - // Verify new content was processed + // Verify both pages were processed (root at depth 0, updated page at depth 1) const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); - expect(docCalls).toHaveLength(1); - expect(docCalls[0][0].result?.textContent).toContain("# New Content"); - expect(docCalls[0][0].result?.title).toBe("Updated"); - expect(docCalls[0][0].result?.etag).toBe("new-etag"); + expect(docCalls).toHaveLength(2); + + // Find the updated page call + const updatedPageCall = docCalls.find( + (call) => call[0].currentUrl === "https://example.com/updated-page", + ); + expect(updatedPageCall).toBeDefined(); + expect(updatedPageCall![0].result?.textContent).toContain("# New Content"); + expect(updatedPageCall![0].result?.title).toBe("Updated"); + expect(updatedPageCall![0].result?.etag).toBe("new-etag"); }); - it("should not follow links during refresh operations", async () => { + it("should discover and follow new links during refresh operations", async () => { const progressCallback = vi.fn>(); + const rootContent = + "Root

<h1>Root</h1>

"; const contentWithLinks = ` Refreshed Page @@ -1073,20 +1094,30 @@ describe("WebScraperStrategy", () => { `; - // Configure mock to return 200 with new links - mockFetchFn.mockResolvedValue({ - content: contentWithLinks, - mimeType: "text/html", - source: "https://example.com/page-with-links", - status: FetchStatus.SUCCESS, - etag: "new-etag", + // Configure mock to return different content for root vs page + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com") { + return { + content: rootContent, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + } + return { + content: contentWithLinks, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + etag: "new-etag", + }; }); // Create a queue item with pageId and etag (refresh operation) options.initialQueue = [ { url: "https://example.com/page-with-links", - depth: 0, + depth: 1, pageId: 999, etag: "old-etag", }, @@ -1094,19 +1125,21 @@ describe("WebScraperStrategy", () => { await strategy.scrape(options, progressCallback); - // Verify only the initial page was fetched (no link following) - expect(mockFetchFn).toHaveBeenCalledTimes(1); + // Verify root, refresh page, and discovered links were all fetched + // Root (depth 0) + refresh page (depth 1) + 2 new links (depth 2) = 4 total + expect(mockFetchFn).toHaveBeenCalledTimes(4); + expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything()); expect(mockFetchFn).toHaveBeenCalledWith( "https://example.com/page-with-links", expect.anything(), ); - // Verify the new links were not followed - expect(mockFetchFn).not.toHaveBeenCalledWith( + // Verify the new links discovered during refresh WERE followed (this is correct behavior) + expect(mockFetchFn).toHaveBeenCalledWith( "https://example.com/new-link", expect.anything(), ); - expect(mockFetchFn).not.toHaveBeenCalledWith( + expect(mockFetchFn).toHaveBeenCalledWith( "https://example.com/another-new-link", expect.anything(), ); @@ -1151,23 +1184,23 @@ describe("WebScraperStrategy", () => { }; }); - // Create a queue with multiple pages + // Create a queue with multiple pages (all at depth > 0 to avoid root URL processing) options.initialQueue = [ { url: "https://example.com/unchanged", - depth: 0, + depth: 1, pageId: 1, etag: "etag-1", }, { url: "https://example.com/deleted", - depth: 0, + depth: 1, pageId: 2, etag: "etag-2", }, { url: "https://example.com/updated", - depth: 0, + depth: 1, pageId: 3, etag: "etag-3", }, @@ -1175,26 +1208,43 @@ describe("WebScraperStrategy", () => { await strategy.scrape(options, progressCallback); - // Verify all three pages were fetched - expect(mockFetchFn).toHaveBeenCalledTimes(3); + // Verify all three pages plus root were fetched (4 total) + expect(mockFetchFn).toHaveBeenCalledTimes(4); - // Verify only the updated page produced a processed document + // Verify root was processed + only the updated page produced a processed document (2 total) const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); - expect(docCalls).toHaveLength(1); - expect(docCalls[0][0].result?.url).toBe("https://example.com/updated"); - expect(docCalls[0][0].result?.title).toBe("Updated"); + expect(docCalls).toHaveLength(2); + + // Find the updated page (not the root) + const updatedPageCall = docCalls.find( + (call) => call[0].currentUrl === "https://example.com/updated", + ); + expect(updatedPageCall).toBeDefined(); + expect(updatedPageCall![0].result?.url).toBe("https://example.com/updated"); + 
expect(updatedPageCall![0].result?.title).toBe("Updated"); }); it("should preserve depth from original scrape during refresh", async () => { const progressCallback = vi.fn>(); - mockFetchFn.mockResolvedValue({ - content: - "Depth Test

<h1>Content</h1>

", - mimeType: "text/html", - source: "https://example.com/deep-page", - status: FetchStatus.SUCCESS, - etag: "new-etag", + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com") { + return { + content: + "Root

<h1>Root</h1>

", + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + } + return { + content: + "Depth Test

<h1>Content</h1>

", + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + etag: "new-etag", + }; }); // Create a queue item with depth from original scrape @@ -1209,10 +1259,17 @@ describe("WebScraperStrategy", () => { await strategy.scrape(options, progressCallback); - // Verify the processed document preserves the original depth + // Verify both root and deep page were processed (2 documents) const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); - expect(docCalls).toHaveLength(1); - expect(docCalls[0][0].depth).toBe(2); + expect(docCalls).toHaveLength(2); + + // Find the deep page and verify it preserved its depth + const deepPageCall = docCalls.find( + (call) => call[0].currentUrl === "https://example.com/deep-page", + ); + expect(deepPageCall).toBeDefined(); + expect(deepPageCall![0].depth).toBe(2); + expect(deepPageCall![0].pageId).toBe(555); }); }); }); diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index 4fa4e9ed..d8558b07 100644 --- a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -51,6 +51,11 @@ export class WebScraperStrategy extends BaseScraperStrategy { const { url } = item; try { + // Log when processing with ETag for conditional requests + if (item.etag) { + logger.debug(`Processing ${url} with stored ETag: ${item.etag}`); + } + // Define fetch options, passing signal, followRedirects, headers, and etag const fetchOptions = { signal, @@ -62,8 +67,13 @@ export class WebScraperStrategy extends BaseScraperStrategy { // Use AutoDetectFetcher which handles fallbacks automatically const rawContent: RawContent = await this.fetcher.fetch(url, fetchOptions); + logger.debug( + `Fetch result for ${url}: status=${rawContent.status}, etag=${rawContent.etag || "none"}`, + ); + // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND if (rawContent.status !== FetchStatus.SUCCESS) { + logger.debug(`Skipping pipeline for ${url} due to status: ${rawContent.status}`); return { url, links: [], status: rawContent.status }; } diff --git a/src/services/workerService.ts b/src/services/workerService.ts index 1626b939..4f7e843b 100644 --- a/src/services/workerService.ts +++ b/src/services/workerService.ts @@ -47,7 +47,7 @@ export async function registerWorkerService(pipeline: IPipeline): Promise }, onJobError: async (job, error, document) => { logger.warn( - `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`, + `⚠️ Job ${job.id} error ${document ? 
`on document ${document.url}` : ""}: ${error.message}`, ); // Use PostHog's native error tracking instead of custom events diff --git a/src/store/DocumentManagementService.test.ts b/src/store/DocumentManagementService.test.ts index 6095bd18..881be474 100644 --- a/src/store/DocumentManagementService.test.ts +++ b/src/store/DocumentManagementService.test.ts @@ -33,7 +33,7 @@ const mockStore = { checkDocumentExists: vi.fn(), queryLibraryVersions: vi.fn().mockResolvedValue(new Map()), addDocuments: vi.fn(), - deleteDocuments: vi.fn(), + deletePages: vi.fn(), // Status tracking methods updateVersionStatus: vi.fn(), updateVersionProgress: vi.fn(), @@ -270,17 +270,17 @@ describe("DocumentManagementService", () => { const version = "1.0.0"; await docService.removeAllDocuments(library, version); - expect(mockStore.deleteDocuments).toHaveBeenCalledWith(library, version); // Fix: Use mockStoreInstance + expect(mockStore.deletePages).toHaveBeenCalledWith(library, version); // Fix: Use mockStoreInstance }); it("should handle removing documents with null/undefined/empty version", async () => { const library = "test-lib"; await docService.removeAllDocuments(library, null); - expect(mockStore.deleteDocuments).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.deletePages).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance await docService.removeAllDocuments(library, undefined); - expect(mockStore.deleteDocuments).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.deletePages).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance await docService.removeAllDocuments(library, ""); - expect(mockStore.deleteDocuments).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.deletePages).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance }); describe("listVersions", () => { diff --git a/src/store/DocumentManagementService.ts b/src/store/DocumentManagementService.ts index 583dd917..cfdfe671 100644 --- a/src/store/DocumentManagementService.ts +++ b/src/store/DocumentManagementService.ts @@ -334,19 +334,17 @@ export class DocumentManagementService { logger.info( `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`, ); - const count = await this.store.deleteDocuments(library, normalizedVersion); + const count = await this.store.deletePages(library, normalizedVersion); logger.info(`🗑️ Deleted ${count} documents`); } /** - * Removes all documents for a specific page ID. - * This is more efficient than URL-based deletion when the page ID is known. + * Deletes a page and all its associated document chunks. + * This is used during refresh operations when a page returns 404 Not Found. 
*/ - async removeDocumentsByPageId(pageId: number): Promise { - logger.debug(`Removing documents for page ID: ${pageId}`); - const count = await this.store.deleteDocumentsByPageId(pageId); - logger.info(`🗑️ Deleted ${count} documents`); - return count; + async deletePage(pageId: number): Promise { + logger.debug(`Deleting page ID: ${pageId}`); + await this.store.deletePage(pageId); } /** diff --git a/src/store/DocumentStore.test.ts b/src/store/DocumentStore.test.ts index 28a93355..fc045bc5 100644 --- a/src/store/DocumentStore.test.ts +++ b/src/store/DocumentStore.test.ts @@ -188,7 +188,7 @@ describe("DocumentStore - With Embeddings", () => { ); expect(await store.checkDocumentExists("templib", "1.0.0")).toBe(true); - const deletedCount = await store.deleteDocuments("templib", "1.0.0"); + const deletedCount = await store.deletePages("templib", "1.0.0"); expect(deletedCount).toBe(1); expect(await store.checkDocumentExists("templib", "1.0.0")).toBe(false); }); @@ -769,6 +769,44 @@ describe("DocumentStore - Common Functionality", () => { }); describe("Document Management", () => { + it("should delete both documents and pages when removing all documents", async () => { + const library = "delete-test"; + const version = "1.0.0"; + + // Add multiple pages with documents + await store.addDocuments( + library, + version, + 1, + createScrapeResult("Page 1", "https://example.com/page1", "Content for page 1", [ + "section1", + ]), + ); + await store.addDocuments( + library, + version, + 1, + createScrapeResult("Page 2", "https://example.com/page2", "Content for page 2", [ + "section2", + ]), + ); + + // Verify both pages and documents exist + const versionId = await store.resolveVersionId(library, version); + const pagesBefore = await store.getPagesByVersionId(versionId); + expect(pagesBefore.length).toBe(2); + expect(await store.checkDocumentExists(library, version)).toBe(true); + + // Delete all documents for this version + const deletedCount = await store.deletePages(library, version); + expect(deletedCount).toBe(2); // Should delete 2 documents + + // Verify both documents AND pages are gone + const pagesAfter = await store.getPagesByVersionId(versionId); + expect(pagesAfter.length).toBe(0); // Pages should be deleted too + expect(await store.checkDocumentExists(library, version)).toBe(false); + }); + it("should retrieve documents by ID", async () => { await store.addDocuments( "idtest", diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index 831cdec6..5493213c 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -78,8 +78,8 @@ export class DocumentStore { >; getPageId: Database.Statement<[number, string]>; deleteDocuments: Database.Statement<[string, string]>; - deleteDocumentsByUrl: Database.Statement<[string, string, string]>; deleteDocumentsByPageId: Database.Statement<[number]>; + deletePage: Database.Statement<[number]>; deletePages: Database.Statement<[string, string]>; queryVersions: Database.Statement<[string]>; checkExists: Database.Statement<[string, string]>; @@ -220,11 +220,11 @@ export class DocumentStore { "SELECT id FROM libraries WHERE name = ?", ), // New version-related statements - insertVersion: this.db.prepare<[number, string | null]>( + insertVersion: this.db.prepare<[number, string]>( "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING", ), - resolveVersionId: this.db.prepare<[number, string | null]>( - "SELECT id FROM versions WHERE library_id = ? 
AND name IS ?", + resolveVersionId: this.db.prepare<[number, string]>( + "SELECT id FROM versions WHERE library_id = ? AND name = ?", ), getVersionById: this.db.prepare<[number]>("SELECT * FROM versions WHERE id = ?"), queryVersionsByLibraryId: this.db.prepare<[number]>( @@ -239,18 +239,10 @@ export class DocumentStore { WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '') )`, ), - deleteDocumentsByUrl: this.db.prepare<[string, string, string]>( - `DELETE FROM documents - WHERE page_id IN ( - SELECT p.id FROM pages p - JOIN versions v ON p.version_id = v.id - JOIN libraries l ON v.library_id = l.id - WHERE p.url = ? AND l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '') - )`, - ), deleteDocumentsByPageId: this.db.prepare<[number]>( "DELETE FROM documents WHERE page_id = ?", ), + deletePage: this.db.prepare<[number]>("DELETE FROM pages WHERE id = ?"), deletePages: this.db.prepare<[string, string]>( `DELETE FROM pages WHERE version_id IN ( @@ -602,7 +594,7 @@ export class DocumentStore { this.statements.insertVersion.run(libraryId, normalizedVersion); const versionIdRow = this.statements.resolveVersionId.get( libraryId, - normalizedVersion === null ? "" : normalizedVersion, + normalizedVersion, ) as { id: number } | undefined; if (!versionIdRow || typeof versionIdRow.id !== "number") { throw new StoreError( @@ -687,8 +679,16 @@ export class DocumentStore { */ async storeScraperOptions(versionId: number, options: ScraperOptions): Promise { try { - // biome-ignore lint/correctness/noUnusedVariables: Extract source URL and exclude runtime-only fields using destructuring - const { url: source_url, library, version, signal, ...scraper_options } = options; + // Extract source URL and exclude runtime-only fields using destructuring + const { + url: source_url, + library: _, + version: __, + signal: ___, + initialQueue: ____, + isRefresh: _____, + ...scraper_options + } = options; const optionsJson = JSON.stringify(scraper_options); this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId); @@ -929,10 +929,17 @@ export class DocumentStore { // Resolve library and version IDs (creates them if they don't exist) const versionId = await this.resolveVersionId(library, version); - // Delete existing documents for these URLs to prevent conflicts - const deletedCount = await this.deleteDocumentsByUrl(library, version, url); - if (deletedCount > 0) { - logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`); + // Delete existing documents for this page to prevent conflicts + // First check if the page exists and get its ID + const existingPage = this.statements.getPageId.get(versionId, url) as + | { id: number } + | undefined; + + if (existingPage) { + const result = this.statements.deleteDocumentsByPageId.run(existingPage.id); + if (result.changes > 0) { + logger.debug(`Deleted ${result.changes} existing documents for URL: ${url}`); + } } // Insert documents in a transaction @@ -1003,56 +1010,50 @@ export class DocumentStore { } /** - * Removes documents matching specified library and version + * Removes documents and pages matching specified library and version. + * This consolidated method deletes both documents and their associated pages. 
* @returns Number of documents deleted */ - async deleteDocuments(library: string, version: string): Promise { + async deletePages(library: string, version: string): Promise { try { const normalizedVersion = version.toLowerCase(); + + // First delete documents const result = this.statements.deleteDocuments.run( library.toLowerCase(), normalizedVersion, ); - return result.changes; - } catch (error) { - throw new ConnectionError("Failed to delete documents", error); - } - } - /** - * Removes documents for a specific URL within a library and version - * @returns Number of documents deleted - */ - async deleteDocumentsByUrl( - library: string, - version: string, - url: string, - ): Promise { - try { - const normalizedVersion = version.toLowerCase(); - const result = this.statements.deleteDocumentsByUrl.run( - url, - library.toLowerCase(), - normalizedVersion, - ); + // Then delete the pages (after documents are gone, due to foreign key constraints) + this.statements.deletePages.run(library.toLowerCase(), normalizedVersion); + return result.changes; } catch (error) { - throw new ConnectionError("Failed to delete documents by URL", error); + throw new ConnectionError("Failed to delete documents", error); } } /** - * Removes all documents for a specific page ID. - * This is more efficient than URL-based deletion when the page ID is known. - * @returns Number of documents deleted + * Deletes a page and all its associated document chunks. + * Performs manual deletion in the correct order to satisfy foreign key constraints: + * 1. Delete document chunks (page_id references pages.id) + * 2. Delete page record + * + * This method is used during refresh operations when a page returns 404 Not Found. */ - async deleteDocumentsByPageId(pageId: number): Promise { + async deletePage(pageId: number): Promise { try { - const result = this.statements.deleteDocumentsByPageId.run(pageId); - logger.debug(`Deleted ${result.changes} document(s) for page ID ${pageId}`); - return result.changes; + // Delete documents first (due to foreign key constraint) + const docResult = this.statements.deleteDocumentsByPageId.run(pageId); + logger.debug(`Deleted ${docResult.changes} document(s) for page ID ${pageId}`); + + // Then delete the page record + const pageResult = this.statements.deletePage.run(pageId); + if (pageResult.changes > 0) { + logger.debug(`Deleted page record for page ID ${pageId}`); + } } catch (error) { - throw new ConnectionError("Failed to delete documents by page ID", error); + throw new ConnectionError(`Failed to delete page ${pageId}`, error); } } @@ -1111,7 +1112,7 @@ export class DocumentStore { // 4. 
libraries (if empty) // Delete all documents for this version - const documentsDeleted = await this.deleteDocuments(library, version); + const documentsDeleted = await this.deletePages(library, version); // Delete all pages for this version (must be done after documents, before version) this.statements.deletePages.run(normalizedLibrary, normalizedVersion); diff --git a/src/store/types.ts b/src/store/types.ts index 6a3b59ee..5a734eb3 100644 --- a/src/store/types.ts +++ b/src/store/types.ts @@ -26,9 +26,9 @@ export interface DbPage { export interface DbChunkMetadata { level?: number; // Hierarchical level in document path?: string[]; // Hierarchical path in document - // TODO: Check if `types` is properly use + // TODO: Check if `types` is properly used types?: string[]; // Types of content in this chunk (e.g., "text", "code", "table") - // FIXME: Enable additional metadata fields again once we have a clear schema for what metadata we want to store with each chunk. + // TODO: Enable additional metadata fields again once we have a clear schema for what metadata we want to store with each chunk. // Allow for additional chunk-specific metadata // [key: string]: unknown; } diff --git a/src/tools/index.ts b/src/tools/index.ts index aa659e6e..8b8b8fbe 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -6,6 +6,7 @@ export * from "./FindVersionTool"; export * from "./GetJobInfoTool"; export * from "./ListJobsTool"; export * from "./ListLibrariesTool"; +export * from "./RefreshVersionTool"; export * from "./RemoveTool"; export * from "./ScrapeTool"; export * from "./SearchTool"; From abda63adba3c9176e9b53cfbaf2d729c2c89db7f Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Tue, 11 Nov 2025 05:23:04 -0800 Subject: [PATCH 09/20] fix(scraper): enhance refresh mode handling and page processing - Added tests for refresh mode with initialQueue to ensure items are prioritized and metadata (pageId, etag) is preserved. - Improved BaseScraperStrategy to utilize final URLs from results for progress callbacks and link resolution. - Updated WebScraperStrategy to return final URLs after redirects for accurate indexing. - Enhanced DocumentStore tests to validate refresh operations, ensuring proper handling of metadata and document existence. - Implemented resiliency tests in the refresh pipeline to handle network timeouts and redirect chains effectively. - Added file-based refresh scenarios to detect changes in file structure, ensuring accurate indexing of new, modified, and deleted files. --- docs/refresh-testing-prd.md | 542 +++++++----------- src/pipeline/PipelineManager.test.ts | 4 +- src/pipeline/PipelineManager.ts | 1 - src/pipeline/PipelineWorker.test.ts | 177 ++++++ src/scraper/fetcher/FileFetcher.test.ts | 166 +++--- src/scraper/fetcher/HttpFetcher.test.ts | 218 +++---- .../strategies/BaseScraperStrategy.test.ts | 476 +++++++++++++++ src/scraper/strategies/BaseScraperStrategy.ts | 31 +- src/scraper/strategies/WebScraperStrategy.ts | 9 +- src/store/DocumentStore.test.ts | 116 +++- test/refresh-pipeline-e2e.test.ts | 326 ++++++++++- 11 files changed, 1517 insertions(+), 549 deletions(-) diff --git a/docs/refresh-testing-prd.md b/docs/refresh-testing-prd.md index dd52ac3b..2e12988e 100644 --- a/docs/refresh-testing-prd.md +++ b/docs/refresh-testing-prd.md @@ -1,430 +1,290 @@ -# Refresh Testing PRD +# Test Refactoring Implementation Plan ## Overview -This document outlines additional test cases needed to ensure comprehensive coverage of the refresh functionality. 
The focus is on unit tests for specific components and edge cases not covered by existing E2E tests. +This document serves as a comprehensive to-do list for refactoring all unit tests to follow a behavior-driven testing philosophy. The focus is on validating public contracts and observable outcomes, rather than internal implementation details. -## Existing Coverage +## Testing Philosophy -The current `test/refresh-pipeline-e2e.test.ts` covers: +- **Behavior-Driven**: Tests validate the public contract of a component. We test _what_ it does, not _how_ it does it. +- **Consolidate and Elevate**: We favor integration tests that cover a complete workflow over multiple granular unit tests. +- **Clarity of Purpose**: Tests are separated into **Unit/Integration** (verifying component behavior) and **E2E** (verifying complete system workflows). +- **Avoid Implementation Details**: Don't test how something is implemented. Test the observable behavior. -- ✅ Page deletion (404 during refresh) -- ✅ Page updates (200 with new content) -- ✅ Unchanged pages (304 responses) -- ✅ New page discovery during refresh -- ✅ 404 handling during normal scraping +--- -## Proposed Additional Test Coverage +## Implementation Checklist -### 1. BaseScraperStrategy Unit Tests +### Phase 1: DocumentStore (`src/store/DocumentStore.test.ts`) -**File:** `src/scraper/strategies/BaseScraperStrategy.test.ts` (extend existing) +#### Refresh-Related Tests -#### 1.1 Initial Queue Processing +- [x] **REMOVE** - `describe("Refresh Operations - deletePage", ...)` block -```typescript -describe("initialQueue handling", () => { - it("should process all items from initialQueue before discovering new links"); - it("should preserve depth from initialQueue items"); - it("should preserve pageId from initialQueue items"); - it("should preserve etag from initialQueue items"); - it("should deduplicate between initialQueue and root URL"); - it("should handle empty initialQueue gracefully"); -}); -``` + - **Rationale**: This is an implementation detail better tested at the `PipelineWorker` level + - **Files to update**: `src/store/DocumentStore.test.ts` -**Rationale:** The initialQueue is critical for refresh operations but isn't thoroughly tested at the unit level. We need to verify it's properly integrated into the scraping workflow. +- [x] **KEEP** - `describe("Refresh Operations - getPagesByVersionId", ...)` block + - **Rationale**: Tests the public contract for building refresh queues + - **Action**: No changes needed -#### 1.2 Refresh Mode Detection +#### Non-Refresh Tests to Refine -```typescript -describe("refresh mode detection", () => { - it("should detect refresh mode when initialQueue is provided"); - it("should use normal mode when initialQueue is empty"); - it("should correctly calculate effectiveTotal with initialQueue"); - it("should correctly track totalDiscovered with initialQueue"); -}); -``` +- [x] **REFINED** - "Embedding Batch Processing" tests -**Rationale:** The strategy behaves differently in refresh mode. We should verify this detection logic works correctly. 
+ - **Action**: Refactored to test observable behavior (documents are successfully embedded and searchable) rather than implementation details (exact batch sizes) + - **Changes made**: Replaced test that checked exact batch counts with test that verifies all documents are embedded and searchable + - **Files updated**: `src/store/DocumentStore.test.ts` + - **Status**: All 29 tests passing -#### 1.3 Root URL Handling in Refresh +- [x] **KEPT** - "Hybrid Search" and "FTS-only Search" tests -```typescript -describe("root URL handling during refresh", () => { - it("should process root URL even if it appears in initialQueue"); - it("should not duplicate root URL if already in initialQueue"); - it("should use etag from initialQueue for root URL if available"); - it("should add root URL at depth 0 if not in initialQueue"); -}); -``` + - **Rationale**: These test the quality and correctness of search results (observable behavior) + - **Status**: No changes needed -**Rationale:** Root URL handling has special logic that needs validation to ensure it's always processed exactly once. +- [x] **KEPT** - Core contract tests (storage, retrieval, versioning) + - **Rationale**: Well-structured behavior-driven tests + - **Status**: No changes needed -### 2. ProcessItem Result Status Handling +--- -**File:** `src/scraper/strategies/BaseScraperStrategy.test.ts` (extend existing) +### Phase 2: HttpFetcher (`src/scraper/fetcher/HttpFetcher.test.ts`) -#### 2.1 Status-Based Counting +#### Refresh-Related Tests to Consolidate -```typescript -describe("page counting with different statuses", () => { - it("should count pages that return 200 OK"); - it("should count pages that return 304 Not Modified"); - it("should count pages that return 404 Not Found"); - it("should NOT count directory discoveries (no content, no pageId)"); - it("should increment pageCount correctly with mixed statuses"); -}); -``` +- [x] **CONSOLIDATE** - Conditional request header tests -**Rationale:** The `shouldCount` logic in `processBatch` is critical for correct progress reporting and needs explicit testing. + - **Current**: Multiple scattered tests for `If-None-Match` header + - **Target**: Two clear tests as specified in the plan + - **Files to update**: `src/scraper/fetcher/HttpFetcher.test.ts` + - **Completed**: Consolidated from 3 tests → 2 tests -#### 2.2 Progress Callback with Statuses +- [x] **CONSOLIDATE** - 304 response handling tests -```typescript -describe("progress callback with different statuses", () => { - it("should call progressCallback with result=null for 304 responses"); - it("should call progressCallback with result=null for 404 responses"); - it("should call progressCallback with deleted=true for 404 responses"); - it("should call progressCallback with full result for 200 responses"); - it("should include pageId in progress for refresh operations"); -}); -``` + - **Current**: Multiple tests for 304 behavior + - **Target**: Consolidate into focused behavior tests + - **Files to update**: `src/scraper/fetcher/HttpFetcher.test.ts` + - **Completed**: Consolidated from 3 tests → 1 test (correctly mocked as success, not error) -**Rationale:** Progress callbacks are how external systems track refresh progress. We need to verify they receive correct information for each status. 
+- [x] **CONSOLIDATE** - ETag extraction tests + - **Current**: Multiple tests for ETag formats + - **Target**: Single test with multiple format examples + - **Files to update**: `src/scraper/fetcher/HttpFetcher.test.ts` + - **Completed**: Consolidated from 2 tests → 1 test -### 3. ETag Handling Unit Tests +#### Non-Refresh Tests to Consolidate -**File:** `src/scraper/fetcher/HttpFetcher.test.ts` (extend existing) +- [x] **CONSOLIDATE** - Retry logic tests -#### 3.1 Conditional Request Headers + - **Current**: One test per status code (429, 500, 503, etc.) + - **Target**: Two primary tests: + - One for retryable statuses `[408, 429, 500, 502, 503, 504, 525]` + - One for non-retryable statuses `[400, 401, 403, 405, 410]` + - **Files to update**: `src/scraper/fetcher/HttpFetcher.test.ts` + - **Completed**: Consolidated from 5 tests → 2 tests -```typescript -describe("conditional request headers", () => { - it("should send If-None-Match header when etag is provided"); - it("should NOT send If-None-Match header when etag is null"); - it("should NOT send If-None-Match header when etag is undefined"); - it("should handle etag with quotes correctly"); - it("should handle etag without quotes correctly"); -}); -``` +- [x] **KEEP** - Cancellation and redirect handling tests + - **Rationale**: Excellent examples of testing observable behavior + - **Action**: No changes needed + - **Result**: All 31 tests passing -**Rationale:** ETag header formatting is critical for conditional requests. We need to verify it follows HTTP standards. +--- -#### 3.2 ETag in Response +### Phase 3: FileFetcher (`src/scraper/fetcher/FileFetcher.test.ts`) -```typescript -describe("ETag extraction from responses", () => { - it("should extract ETag from 200 responses"); - it("should preserve ETag from 304 responses"); - it("should handle missing ETag header gracefully"); - it("should handle weak ETags (W/) correctly"); - it("should normalize ETag quotes consistently"); -}); -``` - -**Rationale:** ETag extraction must be consistent to enable proper change detection in future refreshes. +#### Refresh-Related Tests -### 4. 
FileFetcher ETag Tests +- [x] **REMOVE** - "Mtime-based ETag generation" tests -**File:** `src/scraper/fetcher/FileFetcher.test.ts` (new file) + - **Rationale**: Implementation detail (how ETags are generated) + - **Tests to remove**: + - "should generate ETag from file mtime" + - "should return same ETag for unchanged files" + - "should return different ETag when file is modified" + - **Files to update**: `src/scraper/fetcher/FileFetcher.test.ts` + - **Completed**: Removed 3 implementation detail tests -#### 4.1 Mtime-Based ETag Generation +- [x] **CONSOLIDATE** - "File status detection for refresh" tests + - **Current**: Multiple granular tests + - **Target**: Four core behavioral tests: + - "should return NOT_MODIFIED when fetching an unchanged file with its etag" + - "should return SUCCESS when fetching a modified file with its old etag" + - "should return NOT_FOUND when the file has been deleted" + - "should return SUCCESS when fetching a new file without an etag" + - **Files to update**: `src/scraper/fetcher/FileFetcher.test.ts` + - **Completed**: Consolidated from 6 tests → 4 focused behavior tests + - **Result**: All 15 tests passing -```typescript -describe("mtime-based ETag generation", () => { - it("should generate ETag from file mtime"); - it("should return same ETag for unchanged files"); - it("should return different ETag when file is modified"); - it("should handle files without mtime gracefully"); - it("should generate consistent ETag format (ISO string)"); -}); -``` +#### Non-Refresh Tests to Consolidate -**Rationale:** FileFetcher uses mtime as ETag equivalent. This needs explicit testing to ensure it works correctly. +- [x] **CONSOLIDATED** - MIME type detection tests + - **Previous**: Single large test checking all file types inline + - **Current**: Parameterized test using `it.each` with file extension to MIME type mapping + - **Benefits**: + - Better test output (13 individual test cases vs 1 monolithic test) + - Each file type tested independently + - Easy to add new file types + - Clear test names showing exactly what's being tested + - **Files updated**: `src/scraper/fetcher/FileFetcher.test.ts` + - **Status**: All 27 tests passing (15 baseline + 4 refresh + 8 other tests) + - **Note**: Converted from 1 test with 13 inline checks → 13 parameterized tests -#### 4.2 File Status Detection +--- -```typescript -describe("file status detection", () => { - it("should return SUCCESS when file exists"); - it("should return NOT_FOUND when file does not exist"); - it("should return NOT_MODIFIED when mtime matches etag"); - it("should return SUCCESS when mtime differs from etag"); - it("should handle permission errors appropriately"); -}); -``` +### Phase 4: BaseScraperStrategy (`src/scraper/strategies/BaseScraperStrategy.test.ts`) -**Rationale:** File status detection drives refresh logic for local files and needs thorough testing. +#### Refresh-Related Tests to Add -### 5. 
PipelineWorker Refresh Logic +- [x] **ALREADY PRESENT** - `describe("Refresh mode with initialQueue", ...)` -**File:** `src/pipeline/PipelineWorker.test.ts` (extend existing) + - **Tests present**: + - "should prioritize initialQueue items before discovering new links" + - "should preserve pageId from initialQueue items" + - "should preserve etag from initialQueue items" + - "should not duplicate root URL if already in initialQueue" + - **Files**: `src/scraper/strategies/BaseScraperStrategy.test.ts` + - **Status**: All 4 tests passing -#### 5.1 Status-Based Database Operations +- [x] **ALREADY PRESENT** - `describe("Page counting with different fetch statuses", ...)` -```typescript -describe("database operations based on fetch status", () => { - it("should skip database operations for 304 Not Modified"); - it("should delete and re-insert for 200 OK with pageId"); - it("should insert new page for 200 OK without pageId"); - it("should call deletePage for 404 Not Found"); - it("should not process content for 404 Not Found"); -}); -``` + - **Tests present**: + - "should count pages that return 200 OK" + - "should count pages that return 304 Not Modified" + - "should count pages that return 404 Not Found" + - **Files**: `src/scraper/strategies/BaseScraperStrategy.test.ts` + - **Status**: All 3 tests passing + - **Note**: Removed 1 test checking implementation details (totalPages calculation) -**Rationale:** PipelineWorker orchestrates database operations based on status. This critical logic needs unit tests. +- [x] **ALREADY PRESENT** - `describe("Progress callbacks with different statuses", ...)` + - **Tests present**: + - "should call progressCallback with result=null for 304 responses" + - "should call progressCallback with deleted=true for 404 responses" + - "should include pageId in progress for refresh operations" + - **Files**: `src/scraper/strategies/BaseScraperStrategy.test.ts` + - **Status**: All 3 tests passing -#### 5.2 PageId Handling +#### Non-Refresh Tests to Refine -```typescript -describe("pageId handling during refresh", () => { - it("should use pageId from scrape result when available"); - it("should handle missing pageId for new pages"); - it("should pass pageId to removeDocumentsByPageId"); - it("should pass pageId to deletePage"); - it("should preserve pageId in progress events"); -}); -``` +- [x] **ALREADY WELL-STRUCTURED** - URL filtering tests -**Rationale:** PageId is the key identifier for refresh operations. We need to verify it's handled correctly throughout the pipeline. + - **Current**: Well-organized tests covering all scenarios + - **Tests present**: 6 tests covering include/exclude with glob/regex patterns + - **Status**: All tests passing, no changes needed -### 6. 
DocumentStore Deletion Methods +- [x] **KEEP** - Core crawling tests + - **Tests to keep**: + - maxPages and maxDepth enforcement + - URL deduplication + - Breadth-first search ordering + - **Rationale**: Excellent behavior-driven tests + - **Action**: No changes needed + - **Status**: All tests passing -**File:** `src/store/DocumentStore.test.ts` (extend existing) +--- -#### 6.1 deletePage Method +### Phase 5: PipelineWorker (`src/pipeline/PipelineWorker.test.ts`) -```typescript -describe("deletePage method", () => { - it("should delete page and all associated documents via CASCADE"); - it("should return true when page exists and is deleted"); - it("should return false when page does not exist"); - it("should handle concurrent deletions gracefully"); - it("should not affect other pages in same version"); -}); -``` +#### Tests Added -**Rationale:** The new deletePage method is critical for proper 404 handling. It needs comprehensive unit tests. +- [x] **ADDED** - `describe("Database operations based on fetch status", ...)` + - **Tests added**: + - "should perform NO database writes for a 304 Not Modified status" + - "should DELETE existing documents and INSERT new ones for a 200 OK status on an existing page" + - "should INSERT new documents for a 200 OK status on a new page" + - "should call deletePage for a 404 Not Found status" + - **Rationale**: This is the critical integration point where HTTP status codes translate to database state + - **Files updated**: `src/pipeline/PipelineWorker.test.ts` + - **Status**: All 4 new tests passing (10 total tests in file) -#### 6.2 removeDocumentsByPageId Method +--- -```typescript -describe("removeDocumentsByPageId method", () => { - it("should remove all documents for given pageId"); - it("should return count of documents removed"); - it("should not affect documents from other pages"); - it("should handle non-existent pageId gracefully"); - it("should handle empty document set gracefully"); -}); -``` +### Phase 6: E2E Tests (`test/refresh-pipeline-e2e.test.ts`) -**Rationale:** This method is used during content updates (200 OK). We need to verify it works correctly. +#### Tests Added -### 7. Strategy-Specific Refresh Tests +- [x] **ADDED** - Multi-status refresh scenarios -**File:** `src/scraper/strategies/WebScraperStrategy.test.ts` (extend existing) + - ✅ "should delete documents when a page returns 404 during refresh" + - ✅ "should update documents when a page has changed content during refresh" + - ✅ "should skip processing when pages return 304 Not Modified" + - ✅ "should discover and index new pages during refresh" -#### 7.1 ETag Propagation +- [x] **ADDED** - File-based refresh scenarios -```typescript -describe("ETag propagation in WebScraperStrategy", () => { - it("should pass etag from QueueItem to fetcher"); - it("should preserve etag in ProcessItemResult"); - it("should update etag when content changes (200 OK)"); - it("should preserve etag when content unchanged (304)"); - it("should clear etag for deleted pages (404)"); -}); -``` + - ✅ "should detect new files, modified files, and deleted files during refresh" + - ✅ "should handle unchanged files efficiently during file-based refresh" -**Rationale:** We need to verify ETags flow correctly through the web scraping pipeline. 
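
To make the ETag-propagation intent above concrete, here is a minimal vitest sketch of the expected contract. It is an illustration only: the fetcher object and `processItem` function below are simplified stand-ins for the real `HttpFetcher` and `WebScraperStrategy.processItem`, and the `"not_modified"` string stands in for `FetchStatus.NOT_MODIFIED`; none of this is the project's actual implementation.

```typescript
import { describe, expect, it, vi } from "vitest";

describe("ETag propagation (illustrative sketch)", () => {
  it("forwards the stored etag as a conditional request and surfaces 304 as not_modified", async () => {
    // Stand-in fetcher that reports the page as unchanged.
    const fetcher = {
      fetch: vi.fn().mockResolvedValue({ status: "not_modified", etag: '"abc123"' }),
    };

    // Simplified stand-in for processItem: pass the queue item's etag to the
    // fetcher and propagate the resulting status without processing content.
    const processItem = async (item: { url: string; pageId?: number; etag?: string }) => {
      const raw = await fetcher.fetch(item.url, { etag: item.etag });
      return { url: item.url, links: [] as string[], status: raw.status };
    };

    const result = await processItem({
      url: "https://example.com/docs",
      pageId: 42,
      etag: '"abc123"',
    });

    // The stored etag must reach the fetcher unchanged...
    expect(fetcher.fetch).toHaveBeenCalledWith("https://example.com/docs", {
      etag: '"abc123"',
    });
    // ...and an unchanged page must yield no content and no new links.
    expect(result.status).toBe("not_modified");
    expect(result.links).toEqual([]);
  });
});
```

The real tests would assert against the strategy and fetcher classes themselves; the sketch only fixes the shape of the contract being verified.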
+- [x] **ADDED** - Standard scrape error handling
-#### 7.2 Refresh with Redirects
+  - ✅ "should gracefully handle 404 errors for broken links during normal scraping"
+  - ✅ "should continue scraping after encountering multiple 404 errors"
-```typescript
-describe("refresh with URL redirects", () => {
-  it("should update canonical URL after redirect");
-  it("should use new ETag after redirect");
-  it("should handle redirect to same domain");
-  it("should handle redirect during refresh operation");
-});
-```
+- [x] **ADDED** - Edge cases & resiliency
+  - ✅ "should handle network timeouts gracefully and continue processing other pages"
+  - ✅ "should follow redirects and use the final URL for indexing"
+  - ✅ "should handle redirect chains during refresh and update canonical URLs"
-**Rationale:** Redirects during refresh can complicate URL tracking. This needs explicit testing.
+**Status**: All 11 E2E tests passing. Complete end-to-end validation of refresh pipeline functionality using `nock` for HTTP mocking and `memfs` for file system mocking.
-### 8. LocalFileStrategy Refresh Tests
+**Key Fixes Made**:
-**File:** `src/scraper/strategies/LocalFileStrategy.test.ts` (extend existing)
+- Fixed `PipelineManager.enqueueRefreshJob` to not override `maxPages` from stored options
+- Fixed file-based test to properly reset `memfs` volume before modifying file structure
+- All tests now use mocked responses (no real network calls or timeouts)
-#### 8.1 File Modification Detection
+---
-```typescript
-describe("file modification detection", () => {
-  it("should detect when file mtime has changed");
-  it("should skip processing when mtime unchanged");
-  it("should handle file deletion during refresh");
-  it("should discover new files during refresh");
-});
-```
+## Implementation Approach
-**Rationale:** The existing refresh tests in LocalFileStrategy are good but can be expanded with more specific mtime scenarios.
+### Step-by-Step Process
-#### 8.2 Directory Re-scanning
+1. **Start with removals** - Clean up implementation detail tests first
+2. **Then consolidate** - Combine similar tests into more powerful versions
+3. **Finally add** - Implement missing behavioral tests
+4. **Verify** - Run full test suite after each phase
-```typescript
-describe("directory re-scanning during refresh", () => {
-  it("should discover files added to directory");
-  it("should detect files removed from directory");
-  it("should handle nested directory changes");
-  it("should preserve depth for existing files");
-});
-```
+### For Each Test File
-**Rationale:** Directory refresh requires full re-scan. We need to verify this works correctly.
+1. Review the current test structure
+2. Identify tests that match the "remove" or "consolidate" criteria
+3. Make changes incrementally
+4. Run tests after each change to ensure nothing breaks
+5. Commit changes with clear messages
-### 9. GitHubScraperStrategy Refresh Tests
+---
-**File:** `src/scraper/strategies/GitHubScraperStrategy.test.ts` (extend existing)
-#### 9.1 Mixed Content Refresh
-```typescript
-describe("mixed wiki and file refresh", () => {
-  it("should refresh wiki pages with HTTP ETags");
-  it("should refresh repository files with API ETags");
-  it("should handle wiki deletion gracefully");
-  it("should discover new files added to repository");
-  it("should handle tree API rate limiting");
-});
-```
-**Rationale:** GitHub strategy handles both wiki and files. Refresh logic for both needs validation.
-### 10. Edge Cases and Error Scenarios
-**File:** `test/refresh-edge-cases-e2e.test.ts` (new file)
-#### 10.1 Network Failures During Refresh
-```typescript
-describe("network failures during refresh", () => {
-  it("should handle timeout for single page gracefully");
-  it("should continue refresh after network error");
-  it("should mark job as failed after multiple errors");
-  it("should preserve valid pages after partial failure");
-});
-```
-**Rationale:** Network issues are common in production. We need to verify graceful degradation.
-#### 10.2 Database Failures During Refresh
-```typescript
-describe("database failures during refresh", () => {
-  it("should rollback transaction on deletion failure");
-  it("should handle constraint violations gracefully");
-  it("should recover from temporary lock contention");
-  it("should preserve database consistency on error");
-});
-```
-**Rationale:** Database operations can fail. We need to verify error handling maintains consistency.
-#### 10.3 Concurrent Refresh Operations
-```typescript
-describe("concurrent refresh operations", () => {
-  it("should handle concurrent refreshes of same version");
-  it("should handle concurrent refreshes of different versions");
-  it("should prevent duplicate processing of same URL");
-  it("should maintain database consistency with concurrent writes");
-});
-```
-**Rationale:** Production systems may trigger multiple refreshes. We need to verify concurrent safety.
-#### 10.4 Malformed ETag Handling
-```typescript
-describe("malformed ETag handling", () => {
-  it("should handle ETag with special characters");
-  it("should handle very long ETags");
-  it("should handle empty ETag string");
-  it("should handle ETag with invalid quotes");
-  it("should fall back gracefully with malformed ETags");
-});
-```
-**Rationale:** Real-world servers may return non-standard ETags. We need robust handling.
-## Implementation Priority
-### Phase 1: Critical Unit Tests (High Priority)
-1. **BaseScraperStrategy initialQueue handling** - Core refresh functionality
-2. **PipelineWorker status-based operations** - Database consistency
-3. **DocumentStore deletePage** - New method validation
-4. **HttpFetcher conditional headers** - ETag correctness
-### Phase 2: Strategy-Specific Tests (Medium Priority)
-5. **WebScraperStrategy ETag propagation** - Most common use case
-6. **LocalFileStrategy file modification** - File-based refresh
-7. **FileFetcher status detection** - File-based change detection
-### Phase 3: Edge Cases (Lower Priority)
-8. **Network failures** - Production resilience
-9. **Concurrent operations** - Scale testing
-10. **Malformed data handling** - Robustness
-## Testing Approach
-### Unit Tests
-- **Isolation**: Mock external dependencies (filesystem, network, database)
-- **Speed**: Should run in <100ms per test
-- **Clarity**: Each test validates one specific behavior
-- **Coverage**: Aim for >90% line coverage of refresh code paths
-### Integration Tests
-- **Realistic**: Use in-memory database but real HTTP mocking
-- **Comprehensive**: Test full workflows end-to-end
-- **Performance**: Should complete in <5 seconds per test
-- **Scenarios**: Cover common real-world refresh patterns
+## Success Criteria
-### E2E Tests
+- [ ] **All tests are behavior-driven** - No implementation details tested
+- [ ] **Unit tests run fast** - Component tests complete in <5 seconds total
+- [ ] **E2E tests are comprehensive** - Complete workflows validated end-to-end
+- [ ] **Tests are maintainable** - Clear, focused, easy to update
+- [ ] **Full test coverage** - All public contracts have tests
-- **Complete**: Use full stack including pipeline workers
-- **Realistic**: Mock external services (GitHub API, web servers)
-- **Validation**: Verify database state after operations
-- **Time**: May take 10-30 seconds per test
+---
-## Success Criteria
+## What Makes a Good Test?
-1. **Code Coverage**: >90% line coverage for refresh-related code
-2. **Test Speed**: Unit tests complete in <5 seconds total
-3. **Reliability**: All tests pass consistently (no flakiness)
-4. **Documentation**: Each test has clear description of what it validates
-5. **Maintainability**: Tests use helpers/fixtures to reduce duplication
+### ✅ Good Tests
-## Non-Goals
+- Test observable behavior: "File change detection returns SUCCESS for modified files"
+- Test the contract: "404 status results in page deletion"
+- Test integration points: "PipelineWorker correctly translates status codes to database operations"
+- Use realistic scenarios: "Refresh with mix of 304, 200, and 404 responses"
-- **Performance benchmarking**: Not testing refresh speed, only correctness
-- **Load testing**: Not testing high-volume refresh scenarios
-- **Integration with real services**: All external services should be mocked
-- **UI testing**: Refresh is a backend feature with no UI
+### ❌ Bad Tests
-## Open Questions
+- Test implementation details: "ETag is generated from mtime timestamp"
+- Test internal state: "Queue contains exactly N items"
+- Test trivial behavior: "Function returns the value it was given"
+- Over-mock: Mocking every dependency makes tests fragile and meaningless
-1. Should we test ETag generation algorithms directly, or only their behavior?
-2. How do we test CASCADE DELETE without actually running migrations in tests?
-3. Should we add property-based tests for ETag normalization?
-4. Do we need tests for refresh cancellation mid-operation?
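The 304 scenarios in the E2E suite hinge on conditional requests. A hedged sketch of the kind of `nock` interceptor those scenarios rely on is shown below; the helper name and example values are assumptions for illustration, while `matchHeader` and `reply` are standard `nock` calls.

```typescript
import nock from "nock";

// Hypothetical helper: replies 304 only when the client revalidates with the stored ETag.
export function mockUnchangedPage(baseUrl: string, path: string, etag: string) {
  return nock(baseUrl)
    .get(path)
    .matchHeader("if-none-match", etag) // header matching in nock is case-insensitive
    .reply(304, undefined, { ETag: etag });
}

// Example usage: mockUnchangedPage("https://docs.example.com", "/guide", '"guide-v1"');
```

Pairing such an interceptor with an earlier 200 response that supplied the ETag lets a test assert that a refresh performs no database writes for unchanged pages.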
+--- ## References - Existing E2E tests: `test/refresh-pipeline-e2e.test.ts` - Refresh architecture: `docs/refresh-architecture.md` - Strategy unit tests: `src/scraper/strategies/*.test.ts` +- Fetcher unit tests: `src/scraper/fetcher/*.test.ts` +- Store unit tests: `src/store/DocumentStore.test.ts` diff --git a/src/pipeline/PipelineManager.test.ts b/src/pipeline/PipelineManager.test.ts index ac2d8b40..25849a9c 100644 --- a/src/pipeline/PipelineManager.test.ts +++ b/src/pipeline/PipelineManager.test.ts @@ -639,8 +639,8 @@ describe("PipelineManager", () => { expect(scraperOpts?.initialQueue).toBeDefined(); expect(scraperOpts?.initialQueue).toHaveLength(mockPages.length); - // Verify maxPages is set to the page count - expect(scraperOpts?.maxPages).toBe(mockPages.length); + // Verify maxPages is NOT set (allowing discovery of new pages during refresh) + expect(scraperOpts?.maxPages).toBeUndefined(); }); it("should handle unversioned libraries during refresh", async () => { diff --git a/src/pipeline/PipelineManager.ts b/src/pipeline/PipelineManager.ts index eab25580..8dc9a7fd 100644 --- a/src/pipeline/PipelineManager.ts +++ b/src/pipeline/PipelineManager.ts @@ -370,7 +370,6 @@ export class PipelineManager implements IPipeline { ...(storedOptions?.options || {}), // Include stored options if available (spread first) // Override with refresh-specific options (these must come after the spread) initialQueue, // Pre-populated queue with existing pages - maxPages: pages.length, isRefresh: true, // Mark this as a refresh operation }; diff --git a/src/pipeline/PipelineWorker.test.ts b/src/pipeline/PipelineWorker.test.ts index 068f0104..d1b05ac1 100644 --- a/src/pipeline/PipelineWorker.test.ts +++ b/src/pipeline/PipelineWorker.test.ts @@ -351,4 +351,181 @@ describe("PipelineWorker", () => { // Verify addScrapeResult was NOT called (deletion failed before that) expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); }); + + describe("Database operations based on fetch status", () => { + it("should perform NO database writes for a 304 Not Modified status", async () => { + // Simulate scrape yielding a 304 Not Modified event + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url1", + depth: 1, + maxDepth: 1, + result: null, // No result for 304 + deleted: false, + pageId: 123, // Page ID from refresh queue + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + await worker.executeJob(mockJob, mockCallbacks); + + // Verify NO database operations were performed + expect(mockStore.deletePage).not.toHaveBeenCalled(); + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); + + // Verify progress was still reported + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + expect(mockCallbacks.onJobProgress).toHaveBeenCalledWith( + mockJob, + expect.objectContaining({ + result: null, + deleted: false, + pageId: 123, + }), + ); + }); + + it("should DELETE existing documents and INSERT new ones for a 200 OK status on an existing page", async () => { + const mockResult: ScrapeResult = { + textContent: "updated content", + url: "url1", + title: "Updated Doc", + contentType: "text/html", + chunks: [], + links: [], + errors: [], + }; + + // Simulate scrape yielding a 200 OK event with pageId (existing page) + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: 
ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url1", + depth: 1, + maxDepth: 1, + result: mockResult, + pageId: 123, // Existing page ID + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + await worker.executeJob(mockJob, mockCallbacks); + + // Verify DELETE was called first + expect(mockStore.deletePage).toHaveBeenCalledOnce(); + expect(mockStore.deletePage).toHaveBeenCalledWith(123); + + // Verify INSERT (addScrapeResult) was called after deletion + expect(mockStore.addScrapeResult).toHaveBeenCalledOnce(); + expect(mockStore.addScrapeResult).toHaveBeenCalledWith( + mockJob.library, + mockJob.version, + 1, + mockResult, + ); + + // Verify call order: delete before add + const deleteCallOrder = (mockStore.deletePage as Mock).mock.invocationCallOrder[0]; + const addCallOrder = (mockStore.addScrapeResult as Mock).mock + .invocationCallOrder[0]; + expect(deleteCallOrder).toBeLessThan(addCallOrder); + + // Verify progress was reported + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + }); + + it("should INSERT new documents for a 200 OK status on a new page", async () => { + const mockResult: ScrapeResult = { + textContent: "new content", + url: "url2", + title: "New Doc", + contentType: "text/html", + chunks: [], + links: [], + errors: [], + }; + + // Simulate scrape yielding a 200 OK event without pageId (new page) + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url2", + depth: 1, + maxDepth: 1, + result: mockResult, + pageId: undefined, // No pageId = new page + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + await worker.executeJob(mockJob, mockCallbacks); + + // Verify NO deletion was performed (new page) + expect(mockStore.deletePage).not.toHaveBeenCalled(); + + // Verify INSERT (addScrapeResult) was called + expect(mockStore.addScrapeResult).toHaveBeenCalledOnce(); + expect(mockStore.addScrapeResult).toHaveBeenCalledWith( + mockJob.library, + mockJob.version, + 1, + mockResult, + ); + + // Verify progress was reported + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + }); + + it("should call deletePage for a 404 Not Found status", async () => { + // Simulate scrape yielding a 404 Not Found event + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url1", + depth: 1, + maxDepth: 1, + result: null, + deleted: true, // 404 - page was deleted + pageId: 123, + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + await worker.executeJob(mockJob, mockCallbacks); + + // Verify deletion was called + expect(mockStore.deletePage).toHaveBeenCalledOnce(); + expect(mockStore.deletePage).toHaveBeenCalledWith(123); + + // Verify NO insert was performed + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); + + // Verify progress was reported + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + expect(mockCallbacks.onJobProgress).toHaveBeenCalledWith( + mockJob, + expect.objectContaining({ + deleted: true, + pageId: 123, + }), + ); + }); + }); }); diff --git a/src/scraper/fetcher/FileFetcher.test.ts b/src/scraper/fetcher/FileFetcher.test.ts index 5a7dac3a..7a96497c 100644 --- a/src/scraper/fetcher/FileFetcher.test.ts +++ 
b/src/scraper/fetcher/FileFetcher.test.ts @@ -43,73 +43,30 @@ describe("FileFetcher", () => { expect(result.mimeType).toBe("text/html"); }); - it("should detect source code MIME types correctly", async () => { + it.each([ + [".ts", "text/x-typescript", "interface User { name: string; }"], + [".tsx", "text/x-tsx", "export const App = () =>
Hello
;"], + [".py", "text/x-python", "def hello(): print('world')"], + [".go", "text/x-go", "package main\nfunc main() {}"], + [".rs", "text/x-rust", 'fn main() { println!("Hello"); }'], + [".kt", "text/x-kotlin", 'fun main() { println("Hello") }'], + [".rb", "text/x-ruby", "puts 'Hello world'"], + [".js", "text/javascript", "console.log('Hello');"], + [".css", "text/css", "body { margin: 0; }"], + [".json", "application/json", '{"name": "test"}'], + [".xml", "application/xml", ""], + [".md", "text/markdown", "# Hello"], + [".sh", "text/x-shellscript", "#!/bin/bash\necho hello"], + ])("should detect %s files as %s", async (extension, expectedMimeType, content) => { const fetcher = new FileFetcher(); - const files = { - "/code/app.ts": "interface User { name: string; }", - "/code/component.tsx": "export const App = () =>
Hello
;", - "/code/script.py": "def hello(): print('world')", - "/code/main.go": "package main\nfunc main() {}", - "/code/lib.rs": 'fn main() { println!("Hello"); }', - "/code/App.kt": 'fun main() { println("Hello") }', - "/code/script.rb": "puts 'Hello world'", - "/code/index.js": "console.log('Hello');", - "/code/style.css": "body { margin: 0; }", - "/code/data.json": '{"name": "test"}', - "/code/config.xml": "", - "/code/readme.md": "# Hello", - "/code/script.sh": "#!/bin/bash\necho hello", - }; - - vol.fromJSON(files); - - // Test TypeScript files - const tsResult = await fetcher.fetch("file:///code/app.ts"); - expect(tsResult.mimeType).toBe("text/x-typescript"); - - const tsxResult = await fetcher.fetch("file:///code/component.tsx"); - expect(tsxResult.mimeType).toBe("text/x-tsx"); - - // Test Python files - const pyResult = await fetcher.fetch("file:///code/script.py"); - expect(pyResult.mimeType).toBe("text/x-python"); - - // Test Go files - const goResult = await fetcher.fetch("file:///code/main.go"); - expect(goResult.mimeType).toBe("text/x-go"); - - // Test Rust files - const rsResult = await fetcher.fetch("file:///code/lib.rs"); - expect(rsResult.mimeType).toBe("text/x-rust"); - - // Test Kotlin files - const ktResult = await fetcher.fetch("file:///code/App.kt"); - expect(ktResult.mimeType).toBe("text/x-kotlin"); - - // Test Ruby files - const rbResult = await fetcher.fetch("file:///code/script.rb"); - expect(rbResult.mimeType).toBe("text/x-ruby"); - - // Test JavaScript files (fallback to mime package) - const jsResult = await fetcher.fetch("file:///code/index.js"); - expect(jsResult.mimeType).toBe("text/javascript"); - - // Test shell scripts - const shResult = await fetcher.fetch("file:///code/script.sh"); - expect(shResult.mimeType).toBe("text/x-shellscript"); - - // Test other file types (fallback to mime package) - const cssResult = await fetcher.fetch("file:///code/style.css"); - expect(cssResult.mimeType).toBe("text/css"); - - const jsonResult = await fetcher.fetch("file:///code/data.json"); - expect(jsonResult.mimeType).toBe("application/json"); - - const xmlResult = await fetcher.fetch("file:///code/config.xml"); - expect(xmlResult.mimeType).toBe("application/xml"); - - const mdResult = await fetcher.fetch("file:///code/readme.md"); - expect(mdResult.mimeType).toBe("text/markdown"); + const fileName = `/code/file${extension}`; + + vol.fromJSON({ + [fileName]: content, + }); + + const result = await fetcher.fetch(`file://${fileName}`); + expect(result.mimeType).toBe(expectedMimeType); }); it("should return status NOT_FOUND if file does not exist", async () => { @@ -223,4 +180,81 @@ describe("FileFetcher", () => { expect(result.mimeType).toBe("text/plain"); expect(result.source).toBe("file://Users/testuser/foo/bar/file.txt"); }); + + describe("File status detection for refresh", () => { + beforeEach(() => { + vol.reset(); + }); + + it("should return NOT_MODIFIED when fetching an unchanged file with its etag", async () => { + const fetcher = new FileFetcher(); + const filePath = "/test/unchanged.txt"; + + vol.fromJSON({ + [filePath]: "content", + }); + + // First fetch to get the ETag + const result1 = await fetcher.fetch(`file://${filePath}`); + const etag = result1.etag; + + // Second fetch with the same ETag should return NOT_MODIFIED + const result2 = await fetcher.fetch(`file://${filePath}`, { etag }); + + expect(result2.status).toBe("not_modified"); + expect(result2.etag).toBe(etag); + expect(result2.content).toEqual(Buffer.from("")); + }); + + it("should return SUCCESS 
when fetching a modified file with its old etag", async () => { + const fetcher = new FileFetcher(); + const filePath = "/test/modified.txt"; + + // Create initial file + vol.fromJSON({ + [filePath]: "initial", + }); + + const result1 = await fetcher.fetch(`file://${filePath}`); + const oldEtag = result1.etag; + + // Wait and modify file + await new Promise((resolve) => setTimeout(resolve, 10)); + vol.fromJSON({ + [filePath]: "modified", + }); + + // Fetch with old ETag should detect change and return SUCCESS + const result2 = await fetcher.fetch(`file://${filePath}`, { etag: oldEtag }); + + expect(result2.status).toBe("success"); + expect(result2.etag).not.toBe(oldEtag); + expect(result2.content.toString()).toBe("modified"); + }); + + it("should return NOT_FOUND when the file has been deleted", async () => { + const fetcher = new FileFetcher(); + + const result = await fetcher.fetch("file:///test/does-not-exist.txt"); + + expect(result.status).toBe("not_found"); + expect(result.content).toEqual(Buffer.from("")); + }); + + it("should return SUCCESS when fetching a new file without an etag", async () => { + const fetcher = new FileFetcher(); + const filePath = "/test/file.txt"; + + vol.fromJSON({ + [filePath]: "content", + }); + + // Fetch without etag should always return SUCCESS + const result = await fetcher.fetch(`file://${filePath}`); + + expect(result.status).toBe("success"); + expect(result.etag).toBeTruthy(); + expect(result.content.toString()).toBe("content"); + }); + }); }); diff --git a/src/scraper/fetcher/HttpFetcher.test.ts b/src/scraper/fetcher/HttpFetcher.test.ts index 603249d8..3330c7c0 100644 --- a/src/scraper/fetcher/HttpFetcher.test.ts +++ b/src/scraper/fetcher/HttpFetcher.test.ts @@ -267,9 +267,8 @@ describe("HttpFetcher", () => { }); describe("retry logic", () => { - it("should retry on all retryable HTTP status codes", async () => { + it("should retry on retryable status codes [408, 429, 500, 502, 503, 504, 525]", async () => { const fetcher = new HttpFetcher(); - // Test all retryable status codes from HttpFetcher: 408, 429, 500, 502, 503, 504, 525 const retryableStatuses = [408, 429, 500, 502, 503, 504, 525]; for (const status of retryableStatuses) { @@ -290,9 +289,8 @@ describe("HttpFetcher", () => { } }); - it("should not retry on non-retryable HTTP status codes", async () => { + it("should not retry on non-retryable status codes [400, 401, 403, 404, 405, 410]", async () => { const fetcher = new HttpFetcher(); - // Test various non-retryable status codes (excluding 404 which has special handling) const nonRetryableStatuses = [400, 401, 403, 405, 410]; for (const status of nonRetryableStatuses) { @@ -308,10 +306,9 @@ describe("HttpFetcher", () => { expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries } - }); - it("should return not_found status for 404 responses", async () => { - const fetcher = new HttpFetcher(); + // 404 has special handling - returns result instead of throwing + mockedAxios.get.mockReset(); mockedAxios.get.mockRejectedValue({ response: { status: 404 } }); const result = await fetcher.fetch("https://example.com", { @@ -319,110 +316,9 @@ describe("HttpFetcher", () => { retryDelay: 1, }); - // 404 should return result with not_found status instead of throwing expect(result.status).toBe("not_found"); - expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries - }); - - it("should retry on undefined status (network errors)", async () => { - const fetcher = new HttpFetcher(); - // Simulate network error without response object - 
mockedAxios.get.mockRejectedValueOnce(new Error("Network timeout")); - mockedAxios.get.mockResolvedValueOnce({ - data: Buffer.from("recovered", "utf-8"), - headers: { "content-type": "text/plain" }, - }); - - const result = await fetcher.fetch("https://example.com", { - maxRetries: 1, - retryDelay: 1, - }); - - expect(result.content).toEqual(Buffer.from("recovered", "utf-8")); - expect(mockedAxios.get).toHaveBeenCalledTimes(2); + expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries for 404 }); - - it("should use exponential backoff for retry delays", async () => { - const fetcher = new HttpFetcher(); - // Mock setTimeout to spy on delay behavior without actually waiting - const setTimeoutSpy = vi.spyOn(global, "setTimeout"); - - // Mock all retries to fail, then succeed - mockedAxios.get.mockRejectedValueOnce({ response: { status: 500 } }); - mockedAxios.get.mockRejectedValueOnce({ response: { status: 500 } }); - mockedAxios.get.mockRejectedValueOnce({ response: { status: 500 } }); - mockedAxios.get.mockResolvedValueOnce({ - data: Buffer.from("success", "utf-8"), - headers: { "content-type": "text/plain" }, - }); - - // Execute fetch with base delay of 10ms - const baseDelay = 10; - await fetcher.fetch("https://example.com", { - maxRetries: 3, - retryDelay: baseDelay, - }); - - // Verify exponential backoff: baseDelay * 2^attempt - // Attempt 0: 10ms, Attempt 1: 20ms, Attempt 2: 40ms - expect(setTimeoutSpy).toHaveBeenCalledWith(expect.any(Function), 10); - expect(setTimeoutSpy).toHaveBeenCalledWith(expect.any(Function), 20); - expect(setTimeoutSpy).toHaveBeenCalledWith(expect.any(Function), 40); - - setTimeoutSpy.mockRestore(); - }); - }); - - it("should not retry on unretryable HTTP errors", async () => { - const fetcher = new HttpFetcher(); - mockedAxios.get.mockRejectedValue({ response: { status: 404 } }); - - const result = await fetcher.fetch("https://example.com", { - retryDelay: 1, // Use minimal delay - }); - - // Should return result with error status instead of throwing - expect(result.status).toBe("not_found"); - expect(mockedAxios.get).toHaveBeenCalledTimes(1); - }); - - it("should retry on retryable HTTP errors", async () => { - const fetcher = new HttpFetcher(); - const retryableErrors = [429, 500, 503]; - for (const status of retryableErrors) { - mockedAxios.get.mockRejectedValueOnce({ response: { status } }); - } - - const htmlContent = "

Hello

"; - mockedAxios.get.mockResolvedValueOnce({ - data: Buffer.from(htmlContent, "utf-8"), - headers: { "content-type": "text/html" }, - }); - - // Test behavior: retry mechanism should eventually succeed - const result = await fetcher.fetch("https://example.com", { - retryDelay: 1, // Use minimal delay to speed up test - maxRetries: 3, - }); - - expect(mockedAxios.get).toHaveBeenCalledTimes(retryableErrors.length + 1); - expect(result.content).toEqual(Buffer.from(htmlContent, "utf-8")); - }); - - it("should throw error after max retries", async () => { - const fetcher = new HttpFetcher(); - const maxRetries = 2; // Use smaller number for faster test - - mockedAxios.get.mockRejectedValue({ response: { status: 502 } }); - - await expect( - fetcher.fetch("https://example.com", { - maxRetries: maxRetries, - retryDelay: 1, // Use minimal delay - }), - ).rejects.toThrow(ScraperError); - - expect(mockedAxios.get).toHaveBeenCalledTimes(maxRetries + 1); }); it("should generate fingerprint headers", async () => { @@ -595,4 +491,108 @@ describe("HttpFetcher", () => { expect(result.source).toBe(finalUrl); }); }); + + describe("Conditional request headers", () => { + beforeEach(() => { + mockedAxios.get.mockReset(); + }); + + it("should send If-None-Match header when etag is provided", async () => { + const fetcher = new HttpFetcher(); + const mockResponse = { + data: Buffer.from("content", "utf-8"), + headers: { "content-type": "text/plain" }, + }; + mockedAxios.get.mockResolvedValue(mockResponse); + + await fetcher.fetch("https://example.com", { etag: '"abc123"' }); + + expect(mockedAxios.get).toHaveBeenCalledWith( + "https://example.com", + expect.objectContaining({ + headers: expect.objectContaining({ + "If-None-Match": '"abc123"', + }), + }), + ); + }); + + it("should NOT send If-None-Match header when etag is not provided", async () => { + const fetcher = new HttpFetcher(); + const mockResponse = { + data: Buffer.from("content", "utf-8"), + headers: { "content-type": "text/plain" }, + }; + mockedAxios.get.mockResolvedValue(mockResponse); + + await fetcher.fetch("https://example.com"); + + expect(mockedAxios.get).toHaveBeenCalledWith( + "https://example.com", + expect.objectContaining({ + headers: expect.not.objectContaining({ + "If-None-Match": expect.anything(), + }), + }), + ); + }); + }); + + describe("304 Not Modified response handling", () => { + beforeEach(() => { + mockedAxios.get.mockReset(); + }); + + it("should handle 304 responses with status='not_modified', empty content, and no retry", async () => { + const fetcher = new HttpFetcher(); + const etag = '"cached-etag-123"'; + + // 304 is treated as successful by validateStatus, so axios resolves (not rejects) + mockedAxios.get.mockResolvedValue({ + status: 304, + data: Buffer.from(""), // 304 typically has no body + headers: { etag }, + config: {}, + statusText: "Not Modified", + }); + + const result = await fetcher.fetch("https://example.com", { etag }); + + expect(result.status).toBe("not_modified"); + expect(result.etag).toBeUndefined(); // 304 response doesn't extract etag from headers + expect(result.content).toEqual(Buffer.from("")); + expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries for 304 + }); + }); + + describe("ETag extraction from responses", () => { + beforeEach(() => { + mockedAxios.get.mockReset(); + }); + + it("should extract etag from response headers (or undefined if missing)", async () => { + const fetcher = new HttpFetcher(); + const etag = '"response-etag-456"'; + + // Test with etag present + 
mockedAxios.get.mockResolvedValue({ + data: Buffer.from("content", "utf-8"), + headers: { "content-type": "text/plain", etag }, + }); + + const resultWithEtag = await fetcher.fetch("https://example.com"); + expect(resultWithEtag.etag).toBe(etag); + + mockedAxios.get.mockReset(); + + // Test with etag missing + mockedAxios.get.mockResolvedValue({ + data: Buffer.from("content", "utf-8"), + headers: { "content-type": "text/plain" }, + }); + + const resultWithoutEtag = await fetcher.fetch("https://example.com"); + expect(resultWithoutEtag.etag).toBeUndefined(); + }); + }); }); diff --git a/src/scraper/strategies/BaseScraperStrategy.test.ts b/src/scraper/strategies/BaseScraperStrategy.test.ts index 628b303c..884c98fb 100644 --- a/src/scraper/strategies/BaseScraperStrategy.test.ts +++ b/src/scraper/strategies/BaseScraperStrategy.test.ts @@ -587,4 +587,480 @@ describe("BaseScraperStrategy", () => { expect(processedUrls).not.toContain("https://example.com/docs/private/secret"); }); }); + + describe("Refresh mode with initialQueue", () => { + beforeEach(() => { + strategy = new TestScraperStrategy(); + strategy.processItem.mockClear(); + }); + + it("should prioritize initialQueue items before discovering new links", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 10, + maxDepth: 2, + initialQueue: [ + { + url: "https://example.com/existing-page1", + depth: 1, + pageId: 101, + etag: "etag1", + }, + { + url: "https://example.com/existing-page2", + depth: 1, + pageId: 102, + etag: "etag2", + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: ["https://example.com/new-page"], + status: FetchStatus.SUCCESS, + }; + } + return { + content: { + textContent: "page content", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; + }); + + await strategy.scrape(options, progressCallback); + + // Verify initialQueue items are processed before discovered links + const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); + const rootIndex = processedUrls.indexOf("https://example.com/"); + const existing1Index = processedUrls.indexOf("https://example.com/existing-page1"); + const existing2Index = processedUrls.indexOf("https://example.com/existing-page2"); + const newPageIndex = processedUrls.indexOf("https://example.com/new-page"); + + // Root URL should be processed first (it's added before initialQueue items) + expect(rootIndex).toBe(0); + + // InitialQueue items should be processed before newly discovered links + expect(existing1Index).toBeLessThan(newPageIndex); + expect(existing2Index).toBeLessThan(newPageIndex); + }); + + it("should preserve pageId from initialQueue items", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 10, + maxDepth: 2, + initialQueue: [ + { + url: "https://example.com/page1", + depth: 1, + pageId: 123, + etag: "etag1", + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); 
+ + // Verify pageId flows through to processItem call + const page1Call = strategy.processItem.mock.calls.find( + (call) => call[0].url === "https://example.com/page1", + ); + expect(page1Call).toBeDefined(); + expect(page1Call![0].pageId).toBe(123); + }); + + it("should preserve etag from initialQueue items", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 10, + maxDepth: 2, + initialQueue: [ + { + url: "https://example.com/page1", + depth: 1, + pageId: 123, + etag: '"test-etag-123"', + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Verify etag flows through to processItem call + const page1Call = strategy.processItem.mock.calls.find( + (call) => call[0].url === "https://example.com/page1", + ); + expect(page1Call).toBeDefined(); + expect(page1Call![0].etag).toBe('"test-etag-123"'); + }); + + it("should not duplicate root URL if already in initialQueue", async () => { + const rootUrl = "https://example.com/"; + const options: ScraperOptions = { + url: rootUrl, + library: "test", + version: "1.0.0", + maxPages: 10, + maxDepth: 2, + initialQueue: [ + { + url: rootUrl, + depth: 0, + pageId: 100, + etag: '"root-etag"', + }, + { + url: "https://example.com/page1", + depth: 1, + pageId: 101, + etag: '"page1-etag"', + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Count how many times root URL was processed + const rootCalls = strategy.processItem.mock.calls.filter( + (call) => call[0].url === rootUrl, + ); + expect(rootCalls).toHaveLength(1); + + // Verify it used the pageId and etag from initialQueue + expect(rootCalls[0][0].pageId).toBe(100); + expect(rootCalls[0][0].etag).toBe('"root-etag"'); + }); + }); + + describe("Page counting with different fetch statuses", () => { + beforeEach(() => { + strategy = new TestScraperStrategy(); + strategy.processItem.mockClear(); + }); + + it("should count pages that return 200 OK", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 3, + maxDepth: 1, + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: ["https://example.com/page1", "https://example.com/page2"], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Verify all 3 pages were counted (root + 2 links) + expect(progressCallback).toHaveBeenCalledTimes(3); + const lastCall = progressCallback.mock.calls[2][0]; + expect(lastCall.pagesScraped).toBe(3); + }); + + it("should count pages that return 304 Not Modified", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 3, + maxDepth: 1, + initialQueue: [ + { url: "https://example.com/page1", depth: 1, pageId: 101, etag: "etag1" }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async 
(item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: ["https://example.com/page1"], + status: FetchStatus.SUCCESS, + }; + } + // page1 returns 304 + return { + content: null, + links: [], + status: FetchStatus.NOT_MODIFIED, + etag: "etag1", + }; + }); + + await strategy.scrape(options, progressCallback); + + // Verify both pages were counted (root=200, page1=304) + expect(progressCallback).toHaveBeenCalledTimes(2); + const lastCall = progressCallback.mock.calls[1][0]; + expect(lastCall.pagesScraped).toBe(2); + }); + + it("should count pages that return 404 Not Found", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 3, + maxDepth: 1, + initialQueue: [ + { + url: "https://example.com/deleted-page", + depth: 1, + pageId: 101, + etag: "etag1", + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; + } + // deleted-page returns 404 + return { + content: null, + links: [], + status: FetchStatus.NOT_FOUND, + }; + }); + + await strategy.scrape(options, progressCallback); + + // Verify both pages were counted (root=200, deleted-page=404) + expect(progressCallback).toHaveBeenCalledTimes(2); + const lastCall = progressCallback.mock.calls[1][0]; + expect(lastCall.pagesScraped).toBe(2); + }); + }); + + describe("Progress callbacks with different statuses", () => { + beforeEach(() => { + strategy = new TestScraperStrategy(); + strategy.processItem.mockClear(); + }); + + it("should call progressCallback with result=null for 304 responses", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 2, + maxDepth: 1, + initialQueue: [ + { url: "https://example.com/page1", depth: 1, pageId: 101, etag: "etag1" }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; + } + // page1 returns 304 + return { + content: null, + links: [], + status: FetchStatus.NOT_MODIFIED, + etag: "etag1", + }; + }); + + await strategy.scrape(options, progressCallback); + + // Find the 304 response progress call + const progress304 = progressCallback.mock.calls.find( + (call) => call[0].currentUrl === "https://example.com/page1", + ); + expect(progress304).toBeDefined(); + expect(progress304![0].result).toBeNull(); + }); + + it("should call progressCallback with deleted=true for 404 responses", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 2, + maxDepth: 1, + initialQueue: [ + { url: "https://example.com/deleted", depth: 1, pageId: 101, etag: "etag1" }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + 
links: [], + status: FetchStatus.SUCCESS, + }; + } + // deleted page returns 404 + return { + content: null, + links: [], + status: FetchStatus.NOT_FOUND, + }; + }); + + await strategy.scrape(options, progressCallback); + + // Find the 404 response progress call + const progress404 = progressCallback.mock.calls.find( + (call) => call[0].currentUrl === "https://example.com/deleted", + ); + expect(progress404).toBeDefined(); + expect(progress404![0].deleted).toBe(true); + expect(progress404![0].result).toBeNull(); + }); + + it("should include pageId in progress for refresh operations", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 3, + maxDepth: 1, + initialQueue: [ + { url: "https://example.com/page1", depth: 1, pageId: 101, etag: "etag1" }, + { url: "https://example.com/page2", depth: 1, pageId: 102, etag: "etag2" }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Verify pageId flows through to progress events for initialQueue items + const page1Progress = progressCallback.mock.calls.find( + (call) => call[0].currentUrl === "https://example.com/page1", + ); + const page2Progress = progressCallback.mock.calls.find( + (call) => call[0].currentUrl === "https://example.com/page2", + ); + + expect(page1Progress).toBeDefined(); + expect(page1Progress![0].pageId).toBe(101); + + expect(page2Progress).toBeDefined(); + expect(page2Progress![0].pageId).toBe(102); + }); + }); }); diff --git a/src/scraper/strategies/BaseScraperStrategy.ts b/src/scraper/strategies/BaseScraperStrategy.ts index 7c01ba40..17287d45 100644 --- a/src/scraper/strategies/BaseScraperStrategy.ts +++ b/src/scraper/strategies/BaseScraperStrategy.ts @@ -191,16 +191,19 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { } // Handle successful processing - report result with content + // Use the final URL from the result (which may differ due to redirects) + const finalUrl = result.url || item.url; + if (result.content) { await progressCallback({ pagesScraped: currentPageCount, totalPages: this.effectiveTotal, totalDiscovered: this.totalDiscovered, - currentUrl: item.url, + currentUrl: finalUrl, depth: item.depth, maxDepth: maxDepth, result: { - url: item.url, + url: finalUrl, title: result.content.title?.trim() || result.title?.trim() || "", contentType: result.contentType || "", textContent: result.content.textContent || "", @@ -214,11 +217,14 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { }); } + // Extract discovered links - use the final URL as the base for resolving relative links const nextItems = result.links || []; + const linkBaseUrl = finalUrl ? 
new URL(finalUrl) : baseUrl; + return nextItems .map((value) => { try { - const targetUrl = new URL(value, baseUrl); + const targetUrl = new URL(value, linkBaseUrl); // Filter using shouldProcessUrl if (!this.shouldProcessUrl(targetUrl.href, options)) { return null; @@ -280,17 +286,6 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { const initialQueue = options.initialQueue || []; const isRefreshMode = initialQueue.length > 0; - // Initialize queue and tracking - // Start with 1 to account for the depth 0 URL that will be processed - this.totalDiscovered = 1; - this.effectiveTotal = 1; - - if (isRefreshMode) { - logger.debug( - `Starting refresh mode with ${initialQueue.length} pre-populated pages`, - ); - } - // Set up base URL and queue this.canonicalBaseUrl = new URL(options.url); let baseUrl = this.canonicalBaseUrl; @@ -305,6 +300,10 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { ); if (isRefreshMode) { + logger.debug( + `Starting refresh mode with ${initialQueue.length} pre-populated pages`, + ); + // Add all items from initialQueue, using visited set to deduplicate for (const item of initialQueue) { const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions); @@ -321,6 +320,10 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { queue.unshift({ url: options.url, depth: 0 } satisfies QueueItem); } + // Initialize counters based on actual queue length after population + this.totalDiscovered = queue.length; + this.effectiveTotal = queue.length; + // Resolve optional values to defaults using temporary variables const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES; const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY; diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index d8558b07..702650d6 100644 --- a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -72,9 +72,10 @@ export class WebScraperStrategy extends BaseScraperStrategy { ); // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND + // Use the final URL from rawContent.source (which may differ due to redirects) if (rawContent.status !== FetchStatus.SUCCESS) { logger.debug(`Skipping pipeline for ${url} due to status: ${rawContent.status}`); - return { url, links: [], status: rawContent.status }; + return { url: rawContent.source, links: [], status: rawContent.status }; } // --- Start Pipeline Processing --- @@ -96,7 +97,7 @@ export class WebScraperStrategy extends BaseScraperStrategy { logger.warn( `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`, ); - return { url, links: [], status: FetchStatus.SUCCESS }; + return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS }; } // Log errors from pipeline @@ -110,7 +111,7 @@ export class WebScraperStrategy extends BaseScraperStrategy { `⚠️ No processable content found for ${url} after pipeline execution.`, ); return { - url, + url: rawContent.source, links: processed.links, status: FetchStatus.SUCCESS, }; @@ -139,7 +140,7 @@ export class WebScraperStrategy extends BaseScraperStrategy { }) ?? 
[]; return { - url, + url: rawContent.source, etag: rawContent.etag, lastModified: rawContent.lastModified, contentType: rawContent.mimeType, diff --git a/src/store/DocumentStore.test.ts b/src/store/DocumentStore.test.ts index fc045bc5..716cba07 100644 --- a/src/store/DocumentStore.test.ts +++ b/src/store/DocumentStore.test.ts @@ -463,33 +463,40 @@ describe("DocumentStore - With Embeddings", () => { } }); - it("should batch documents by character size limit", async () => { + it("should successfully embed and store large batches of documents", async () => { // Skip if embeddings are disabled // @ts-expect-error Accessing private property for testing if (!store.embeddings) { return; } - // Create 3 docs that fit 2 per batch by character size - const contentSize = 24000; // 24KB each - for (let i = 0; i < 3; i++) { + // Add multiple large documents to verify batching works correctly + const docCount = 5; + const contentSize = 15000; // 15KB each - ensures batching behavior + + for (let i = 0; i < docCount; i++) { await store.addDocuments( - "testlib", + "batchtest", "1.0.0", 1, createScrapeResult( - `Doc ${i + 1}`, - `https://example.com/doc${i + 1}`, + `Batch Doc ${i + 1}`, + `https://example.com/batch-doc${i + 1}`, "x".repeat(contentSize), ["section"], ), ); } - // Should create 2 batches - first with 2 docs, second with 1 doc - expect(mockEmbedDocuments).toHaveBeenCalledTimes(2); - expect(mockEmbedDocuments.mock.calls[0][0]).toHaveLength(2); - expect(mockEmbedDocuments.mock.calls[1][0]).toHaveLength(1); + // Verify all documents were successfully embedded and stored + expect(await store.checkDocumentExists("batchtest", "1.0.0")).toBe(true); + + // Verify embedDocuments was called (batching occurred) + expect(mockEmbedDocuments).toHaveBeenCalled(); + + // Verify all documents are searchable (embeddings were applied) + const searchResults = await store.findByContent("batchtest", "1.0.0", "Batch", 10); + expect(searchResults.length).toBe(docCount); }); it("should include proper document headers in embedding text", async () => { @@ -954,4 +961,91 @@ describe("DocumentStore - Common Functionality", () => { } }); }); + + describe("Refresh Operations - getPagesByVersionId", () => { + beforeEach(async () => { + // Add pages with etags for building refresh queue + await store.addDocuments( + "refresh-queue-test", + "1.0.0", + 1, + createScrapeResult( + "Page 1", + "https://example.com/page1", + "Content 1", + ["section1"], + { etag: '"etag1"', lastModified: "2023-01-01T00:00:00Z" }, + ), + ); + await store.addDocuments( + "refresh-queue-test", + "1.0.0", + 1, + createScrapeResult( + "Page 2", + "https://example.com/page2", + "Content 2", + ["section2"], + { etag: '"etag2"', lastModified: "2023-01-02T00:00:00Z" }, + ), + ); + await store.addDocuments( + "refresh-queue-test", + "1.0.0", + 1, + createScrapeResult( + "Page 3 No ETag", + "https://example.com/page3", + "Content 3", + ["section3"], + { etag: null, lastModified: null }, + ), + ); + }); + + it("should retrieve all pages with metadata for refresh queue building", async () => { + const versionId = await store.resolveVersionId("refresh-queue-test", "1.0.0"); + const pages = await store.getPagesByVersionId(versionId); + + expect(pages.length).toBe(3); + + // Verify page1 metadata + const page1 = pages.find((p) => p.url === "https://example.com/page1"); + expect(page1).toBeDefined(); + expect(page1!.id).toBeDefined(); + expect(page1!.etag).toBe('"etag1"'); + expect(page1!.depth).toBe(1); + + // Verify page2 metadata + const page2 = 
pages.find((p) => p.url === "https://example.com/page2"); + expect(page2).toBeDefined(); + expect(page2!.etag).toBe('"etag2"'); + + // Verify page3 (no etag) + const page3 = pages.find((p) => p.url === "https://example.com/page3"); + expect(page3).toBeDefined(); + expect(page3!.etag).toBeNull(); + }); + + it("should return empty array for version with no pages", async () => { + const emptyVersionId = await store.resolveVersionId("empty-lib", "1.0.0"); + const pages = await store.getPagesByVersionId(emptyVersionId); + + expect(pages).toEqual([]); + }); + + it("should include all metadata fields needed for refresh", async () => { + const versionId = await store.resolveVersionId("refresh-queue-test", "1.0.0"); + const pages = await store.getPagesByVersionId(versionId); + + // All pages should have the necessary fields for refresh operations + for (const page of pages) { + expect(page.id).toBeDefined(); + expect(page.url).toBeDefined(); + expect(page.depth).toBeDefined(); + // etag can be null, but the field should exist + expect(page).toHaveProperty("etag"); + } + }); + }); }); diff --git a/test/refresh-pipeline-e2e.test.ts b/test/refresh-pipeline-e2e.test.ts index d2f069e9..a42fd319 100644 --- a/test/refresh-pipeline-e2e.test.ts +++ b/test/refresh-pipeline-e2e.test.ts @@ -11,7 +11,8 @@ */ import nock from "nock"; -import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { vol } from "memfs"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { PipelineManager } from "../src/pipeline/PipelineManager"; import { ScraperService } from "../src/scraper/ScraperService"; import type { ScraperOptions } from "../src/scraper/types"; @@ -20,6 +21,9 @@ import { DocumentStore } from "../src/store/DocumentStore"; import type { StoreSearchResult } from "../src/store/types"; import { ScraperRegistry } from "../src/scraper"; +// Mock file system for file-based tests +vi.mock("node:fs/promises", () => ({ default: vol.promises })); + describe("Refresh Pipeline E2E Tests", () => { let docService: DocumentManagementService; let scraperService: ScraperService; @@ -48,6 +52,7 @@ describe("Refresh Pipeline E2E Tests", () => { await pipelineManager.stop(); await docService.shutdown(); nock.cleanAll(); + vol.reset(); }); describe("Refresh Scenarios", () => { @@ -440,4 +445,323 @@ describe("Refresh Pipeline E2E Tests", () => { expect(urls).toContain(`${TEST_BASE_URL}/page2`); }, 30000); }); + + describe("Resiliency", () => { + it("should handle network timeouts gracefully and continue processing other pages", async () => { + // Setup: Mock initial site where one page times out + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

Home

Page 1Timeout", + { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/page1") + .reply(200, "

Page 1

Working page

", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }) + .get("/timeout-page") + .delayConnection(30000) // Simulate timeout + .reply(200, "Should never reach this"); + + // Execute scrape - should complete despite timeout + const jobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(jobId); + + // Verify that the working pages were indexed despite the timeout + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + + // Should have home and page1, but timeout-page should have failed + expect(pages.length).toBeGreaterThanOrEqual(2); + const urls = pages.map((p) => p.url); + expect(urls).toContain(`${TEST_BASE_URL}/`); + expect(urls).toContain(`${TEST_BASE_URL}/page1`); + + // Verify working page content is searchable + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "working page", 10); + expect(search.length).toBeGreaterThan(0); + }, 60000); + + it("should follow redirects and use the final URL for indexing", async () => { + // Setup: Mock site with redirect + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

Home

Old Link", + { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/old-url") + .reply(301, undefined, { + Location: `${TEST_BASE_URL}/new-url`, + }) + .get("/new-url") + .reply(200, "

New Page

Redirected content

", { + "Content-Type": "text/html", + ETag: '"new-v1"', + }); + + // Execute scrape + const jobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(jobId); + + // Verify pages were indexed + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + + // Should have indexed with the final (redirected) URL + const urls = pages.map((p) => p.url); + expect(urls).toContain(`${TEST_BASE_URL}/`); + expect(urls).toContain(`${TEST_BASE_URL}/new-url`); + + // Verify content from redirected page is searchable + const search = await docService.searchStore( + TEST_LIBRARY, + TEST_VERSION, + "redirected content", + 10, + ); + expect(search.length).toBeGreaterThan(0); + }, 30000); + + it("should handle redirect chains during refresh and update canonical URLs", async () => { + // Setup: Initial scrape with direct URL + nock(TEST_BASE_URL) + .get("/") + .reply(200, "

Home

Page", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }) + .get("/page1") + .reply(200, "

Page 1

Original location

", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }); + + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Setup: Refresh where page1 now redirects to a new location + nock(TEST_BASE_URL) + .get("/") + .reply(304, undefined, { ETag: '"home-v1"' }) + .get("/page1") + .reply(301, undefined, { + Location: `${TEST_BASE_URL}/page1-new`, + }) + .get("/page1-new") + .reply(200, "

<html><body><h1>Page 1 New</h1><p>New location</p></body></html>

", { + "Content-Type": "text/html", + ETag: '"page1-new-v1"', + }); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify the canonical URL was updated + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + const urls = pages.map((p) => p.url); + + // Should now have the new URL + expect(urls).toContain(`${TEST_BASE_URL}/page1-new`); + + // Verify content from new location is searchable + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "new location", 10); + expect(search.length).toBeGreaterThan(0); + }, 30000); + }); + + describe("File-Based Refresh Scenarios", () => { + const TEST_FILE_BASE = "/test-docs"; + const TEST_FILE_LIBRARY = "file-lib"; + const TEST_FILE_VERSION = "1.0.0"; + + beforeEach(() => { + vol.reset(); + }); + + it("should detect new files, modified files, and deleted files during refresh", async () => { + // Setup: Create initial file structure + vol.fromJSON({ + [`${TEST_FILE_BASE}/index.md`]: "# Home\nWelcome to the docs", + [`${TEST_FILE_BASE}/guide.md`]: "# Guide\nOriginal guide content", + [`${TEST_FILE_BASE}/api.md`]: "# API\nAPI documentation", + }); + + // Initial scrape - point to the directory to discover all files + const initialJobId = await pipelineManager.enqueueScrapeJob( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + { + url: `file://${TEST_FILE_BASE}`, + library: TEST_FILE_LIBRARY, + version: TEST_FILE_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions, + ); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Verify initial files were indexed + const versionId = await docService.ensureVersion({ + library: TEST_FILE_LIBRARY, + version: TEST_FILE_VERSION, + }); + const initialPages = await docService.getPagesByVersionId(versionId); + expect(initialPages.length).toBe(3); // index.md, guide.md, api.md + + // Modify file structure: + // 1. Delete api.md + // 2. Modify guide.md + // 3. 
Add new tutorial.md + vol.reset(); + vol.fromJSON({ + [`${TEST_FILE_BASE}/index.md`]: "# Home\nWelcome to the docs", + [`${TEST_FILE_BASE}/guide.md`]: "# Guide\nUpdated guide content with new information", + [`${TEST_FILE_BASE}/tutorial.md`]: "# Tutorial\nStep-by-step tutorial", + }); + + // Wait a bit to ensure file modification times change + await new Promise((resolve) => setTimeout(resolve, 100)); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + ); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify final state + const finalPages = await docService.getPagesByVersionId(versionId); + const finalUrls = finalPages.map((p) => p.url); + + // Should have index, guide, tutorial (but not api) + expect(finalPages.length).toBe(3); + expect(finalUrls).toContain(`file://${TEST_FILE_BASE}/index.md`); + expect(finalUrls).toContain(`file://${TEST_FILE_BASE}/guide.md`); + expect(finalUrls).toContain(`file://${TEST_FILE_BASE}/tutorial.md`); + expect(finalUrls).not.toContain(`file://${TEST_FILE_BASE}/api.md`); + + // Verify modified content is searchable + const modifiedSearch = await docService.searchStore( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + "updated guide content", + 10, + ); + expect(modifiedSearch.length).toBeGreaterThan(0); + + // Verify new file content is searchable + const newSearch = await docService.searchStore( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + "step-by-step tutorial", + 10, + ); + expect(newSearch.length).toBeGreaterThan(0); + + // Verify deleted file content is no longer searchable + const deletedSearch = await docService.searchStore( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + "API documentation", + 10, + ); + const hasDeletedContent = deletedSearch.some( + (r: StoreSearchResult) => r.url === `file://${TEST_FILE_BASE}/api.md`, + ); + expect(hasDeletedContent).toBe(false); + }, 30000); + + it("should handle unchanged files efficiently during file-based refresh", async () => { + // Setup: Create file structure + vol.fromJSON({ + [`${TEST_FILE_BASE}/doc1.md`]: "# Document 1\nStable content", + [`${TEST_FILE_BASE}/doc2.md`]: "# Document 2\nStable content", + }); + + // Initial scrape - point to the directory to discover all files + const initialJobId = await pipelineManager.enqueueScrapeJob( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + { + url: `file://${TEST_FILE_BASE}`, + library: TEST_FILE_LIBRARY, + version: TEST_FILE_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions, + ); + + await pipelineManager.waitForJobCompletion(initialJobId); + + const versionId = await docService.ensureVersion({ + library: TEST_FILE_LIBRARY, + version: TEST_FILE_VERSION, + }); + const initialPages = await docService.getPagesByVersionId(versionId); + const initialPageCount = initialPages.length; + + // Execute refresh without modifying files + const refreshJobId = await pipelineManager.enqueueRefreshJob( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + ); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify page count hasn't changed + const finalPages = await docService.getPagesByVersionId(versionId); + expect(finalPages.length).toBe(initialPageCount); + + // Verify content is still searchable + const search = await docService.searchStore( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + "stable content", + 10, + ); + expect(search.length).toBeGreaterThan(0); + }, 30000); + }); }); From b49f6ee1d33b6559891183cada80e3eab6ac4a5f Mon Sep 17 00:00:00 2001 From: Andre 
Rabold Date: Tue, 11 Nov 2025 05:23:14 -0800 Subject: [PATCH 10/20] refactor(docs): update testing philosophy and guidelines for clarity and focus --- AGENTS.md | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 5d39610f..b8177854 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -101,16 +101,46 @@ ### Test Files -- Create unit test files alongside source files with `.test.ts` suffix -- Run individual TypeScript files: `npx vite-node ` +- Unit tests: alongside source files with `.test.ts` suffix +- E2E tests: in `test/` directory with `*-e2e.test.ts` suffix +- Run: `npx vite-node ` -### Test Strategy +### Testing Philosophy -- Prioritize high-value, low-effort tests -- Test intended behavior, not implementation details -- Defer complex mocking, state management testing, and concurrent processing unless explicitly requested -- Avoid timing-sensitive tests unless absolutely necessary -- Balance maintainability with test coverage +**Core Principle**: Test observable behavior (contracts), not implementation details. + +**Test the "what", not the "how"**: + +- ✅ "File change detection returns SUCCESS for modified files" (observable behavior) +- ❌ "ETag generated from mtime timestamp" (implementation detail) + +**Prefer integration over isolation**: + +- E2E tests > Integration tests > Unit tests +- Default to E2E for new features (highest confidence) +- Add integration tests when components don't interact correctly +- Add unit tests only for complex logic requiring detailed verification + +**What to test**: + +- Public contracts and API boundaries +- Integration points between components +- Complete workflows end-to-end +- Critical business logic + +**What to skip**: + +- Private methods and internal state +- Simple getters/setters and obvious mappings +- Trivial parameter validation +- Implementation-specific details (algorithms, data structures) + +**Quality markers**: + +- Fast: unit tests <100ms, suite <5s +- Focused: one behavior per test +- Maintainable: refactoring doesn't break tests unless behavior changes +- Realistic: tests reflect actual usage patterns ## Git Workflow From 0eba2842bbae9b38c3e1945f7b06e9cffa9302c5 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Tue, 11 Nov 2025 05:50:39 -0800 Subject: [PATCH 11/20] refactor(tests): remove outdated refresh testing documentation --- docs/refresh-testing-prd.md | 290 ------------------------------------ 1 file changed, 290 deletions(-) delete mode 100644 docs/refresh-testing-prd.md diff --git a/docs/refresh-testing-prd.md b/docs/refresh-testing-prd.md deleted file mode 100644 index 2e12988e..00000000 --- a/docs/refresh-testing-prd.md +++ /dev/null @@ -1,290 +0,0 @@ -# Test Refactoring Implementation Plan - -## Overview - -This document serves as a comprehensive to-do list for refactoring all unit tests to follow a behavior-driven testing philosophy. The focus is on validating public contracts and observable outcomes, rather than internal implementation details. - -## Testing Philosophy - -- **Behavior-Driven**: Tests validate the public contract of a component. We test _what_ it does, not _how_ it does it. -- **Consolidate and Elevate**: We favor integration tests that cover a complete workflow over multiple granular unit tests. -- **Clarity of Purpose**: Tests are separated into **Unit/Integration** (verifying component behavior) and **E2E** (verifying complete system workflows). 
-- **Avoid Implementation Details**: Don't test how something is implemented. Test the observable behavior. - ---- - -## Implementation Checklist - -### Phase 1: DocumentStore (`src/store/DocumentStore.test.ts`) - -#### Refresh-Related Tests - -- [x] **REMOVE** - `describe("Refresh Operations - deletePage", ...)` block - - - **Rationale**: This is an implementation detail better tested at the `PipelineWorker` level - - **Files to update**: `src/store/DocumentStore.test.ts` - -- [x] **KEEP** - `describe("Refresh Operations - getPagesByVersionId", ...)` block - - **Rationale**: Tests the public contract for building refresh queues - - **Action**: No changes needed - -#### Non-Refresh Tests to Refine - -- [x] **REFINED** - "Embedding Batch Processing" tests - - - **Action**: Refactored to test observable behavior (documents are successfully embedded and searchable) rather than implementation details (exact batch sizes) - - **Changes made**: Replaced test that checked exact batch counts with test that verifies all documents are embedded and searchable - - **Files updated**: `src/store/DocumentStore.test.ts` - - **Status**: All 29 tests passing - -- [x] **KEPT** - "Hybrid Search" and "FTS-only Search" tests - - - **Rationale**: These test the quality and correctness of search results (observable behavior) - - **Status**: No changes needed - -- [x] **KEPT** - Core contract tests (storage, retrieval, versioning) - - **Rationale**: Well-structured behavior-driven tests - - **Status**: No changes needed - ---- - -### Phase 2: HttpFetcher (`src/scraper/fetcher/HttpFetcher.test.ts`) - -#### Refresh-Related Tests to Consolidate - -- [x] **CONSOLIDATE** - Conditional request header tests - - - **Current**: Multiple scattered tests for `If-None-Match` header - - **Target**: Two clear tests as specified in the plan - - **Files to update**: `src/scraper/fetcher/HttpFetcher.test.ts` - - **Completed**: Consolidated from 3 tests → 2 tests - -- [x] **CONSOLIDATE** - 304 response handling tests - - - **Current**: Multiple tests for 304 behavior - - **Target**: Consolidate into focused behavior tests - - **Files to update**: `src/scraper/fetcher/HttpFetcher.test.ts` - - **Completed**: Consolidated from 3 tests → 1 test (correctly mocked as success, not error) - -- [x] **CONSOLIDATE** - ETag extraction tests - - **Current**: Multiple tests for ETag formats - - **Target**: Single test with multiple format examples - - **Files to update**: `src/scraper/fetcher/HttpFetcher.test.ts` - - **Completed**: Consolidated from 2 tests → 1 test - -#### Non-Refresh Tests to Consolidate - -- [x] **CONSOLIDATE** - Retry logic tests - - - **Current**: One test per status code (429, 500, 503, etc.) 
- - **Target**: Two primary tests: - - One for retryable statuses `[408, 429, 500, 502, 503, 504, 525]` - - One for non-retryable statuses `[400, 401, 403, 405, 410]` - - **Files to update**: `src/scraper/fetcher/HttpFetcher.test.ts` - - **Completed**: Consolidated from 5 tests → 2 tests - -- [x] **KEEP** - Cancellation and redirect handling tests - - **Rationale**: Excellent examples of testing observable behavior - - **Action**: No changes needed - - **Result**: All 31 tests passing - ---- - -### Phase 3: FileFetcher (`src/scraper/fetcher/FileFetcher.test.ts`) - -#### Refresh-Related Tests - -- [x] **REMOVE** - "Mtime-based ETag generation" tests - - - **Rationale**: Implementation detail (how ETags are generated) - - **Tests to remove**: - - "should generate ETag from file mtime" - - "should return same ETag for unchanged files" - - "should return different ETag when file is modified" - - **Files to update**: `src/scraper/fetcher/FileFetcher.test.ts` - - **Completed**: Removed 3 implementation detail tests - -- [x] **CONSOLIDATE** - "File status detection for refresh" tests - - **Current**: Multiple granular tests - - **Target**: Four core behavioral tests: - - "should return NOT_MODIFIED when fetching an unchanged file with its etag" - - "should return SUCCESS when fetching a modified file with its old etag" - - "should return NOT_FOUND when the file has been deleted" - - "should return SUCCESS when fetching a new file without an etag" - - **Files to update**: `src/scraper/fetcher/FileFetcher.test.ts` - - **Completed**: Consolidated from 6 tests → 4 focused behavior tests - - **Result**: All 15 tests passing - -#### Non-Refresh Tests to Consolidate - -- [x] **CONSOLIDATED** - MIME type detection tests - - **Previous**: Single large test checking all file types inline - - **Current**: Parameterized test using `it.each` with file extension to MIME type mapping - - **Benefits**: - - Better test output (13 individual test cases vs 1 monolithic test) - - Each file type tested independently - - Easy to add new file types - - Clear test names showing exactly what's being tested - - **Files updated**: `src/scraper/fetcher/FileFetcher.test.ts` - - **Status**: All 27 tests passing (15 baseline + 4 refresh + 8 other tests) - - **Note**: Converted from 1 test with 13 inline checks → 13 parameterized tests - ---- - -### Phase 4: BaseScraperStrategy (`src/scraper/strategies/BaseScraperStrategy.test.ts`) - -#### Refresh-Related Tests to Add - -- [x] **ALREADY PRESENT** - `describe("Refresh mode with initialQueue", ...)` - - - **Tests present**: - - "should prioritize initialQueue items before discovering new links" - - "should preserve pageId from initialQueue items" - - "should preserve etag from initialQueue items" - - "should not duplicate root URL if already in initialQueue" - - **Files**: `src/scraper/strategies/BaseScraperStrategy.test.ts` - - **Status**: All 4 tests passing - -- [x] **ALREADY PRESENT** - `describe("Page counting with different fetch statuses", ...)` - - - **Tests present**: - - "should count pages that return 200 OK" - - "should count pages that return 304 Not Modified" - - "should count pages that return 404 Not Found" - - **Files**: `src/scraper/strategies/BaseScraperStrategy.test.ts` - - **Status**: All 3 tests passing - - **Note**: Removed 1 test checking implementation details (totalPages calculation) - -- [x] **ALREADY PRESENT** - `describe("Progress callbacks with different statuses", ...)` - - **Tests present**: - - "should call progressCallback with result=null for 
304 responses" - - "should call progressCallback with deleted=true for 404 responses" - - "should include pageId in progress for refresh operations" - - **Files**: `src/scraper/strategies/BaseScraperStrategy.test.ts` - - **Status**: All 3 tests passing - -#### Non-Refresh Tests to Refine - -- [x] **ALREADY WELL-STRUCTURED** - URL filtering tests - - - **Current**: Well-organized tests covering all scenarios - - **Tests present**: 6 tests covering include/exclude with glob/regex patterns - - **Status**: All tests passing, no changes needed - -- [x] **KEEP** - Core crawling tests - - **Tests to keep**: - - maxPages and maxDepth enforcement - - URL deduplication - - Breadth-first search ordering - - **Rationale**: Excellent behavior-driven tests - - **Action**: No changes needed - - **Status**: All tests passing - ---- - -### Phase 5: PipelineWorker (`src/pipeline/PipelineWorker.test.ts`) - -#### Tests Added - -- [x] **ADDED** - `describe("Database operations based on fetch status", ...)` - - **Tests added**: - - "should perform NO database writes for a 304 Not Modified status" - - "should DELETE existing documents and INSERT new ones for a 200 OK status on an existing page" - - "should INSERT new documents for a 200 OK status on a new page" - - "should call deletePage for a 404 Not Found status" - - **Rationale**: This is the critical integration point where HTTP status codes translate to database state - - **Files updated**: `src/pipeline/PipelineWorker.test.ts` - - **Status**: All 4 new tests passing (10 total tests in file) - ---- - -### Phase 6: E2E Tests (`test/refresh-pipeline-e2e.test.ts`) - -#### Tests Added - -- [x] **ADDED** - Multi-status refresh scenarios - - - ✅ "should delete documents when a page returns 404 during refresh" - - ✅ "should update documents when a page has changed content during refresh" - - ✅ "should skip processing when pages return 304 Not Modified" - - ✅ "should discover and index new pages during refresh" - -- [x] **ADDED** - File-based refresh scenarios - - - ✅ "should detect new files, modified files, and deleted files during refresh" - - ✅ "should handle unchanged files efficiently during file-based refresh" - -- [x] **ADDED** - Standard scrape error handling - - - ✅ "should gracefully handle 404 errors for broken links during normal scraping" - - ✅ "should continue scraping after encountering multiple 404 errors" - -- [x] **ADDED** - Edge cases & resiliency - - ✅ "should handle network timeouts gracefully and continue processing other pages" - - ✅ "should follow redirects and use the final URL for indexing" - - ✅ "should handle redirect chains during refresh and update canonical URLs" - -**Status**: All 11 E2E tests passing. Complete end-to-end validation of refresh pipeline functionality using `nock` for HTTP mocking and `memfs` for file system mocking. - -**Key Fixes Made**: - -- Fixed `PipelineManager.enqueueRefreshJob` to not override `maxPages` from stored options -- Fixed file-based test to properly reset `memfs` volume before modifying file structure -- All tests now use mocked responses (no real network calls or timeouts) - ---- - -## Implementation Approach - -### Step-by-Step Process - -1. **Start with removals** - Clean up implementation detail tests first -2. **Then consolidate** - Combine similar tests into more powerful versions -3. **Finally add** - Implement missing behavioral tests -4. **Verify** - Run full test suite after each phase - -### For Each Test File - -1. Review the current test structure -2. 
Identify tests that match the "remove" or "consolidate" criteria -3. Make changes incrementally -4. Run tests after each change to ensure nothing breaks -5. Commit changes with clear messages - ---- - -## Success Criteria - -- [ ] **All tests are behavior-driven** - No implementation details tested -- [ ] **Unit tests run fast** - Component tests complete in <5 seconds total -- [ ] **E2E tests are comprehensive** - Complete workflows validated end-to-end -- [ ] **Tests are maintainable** - Clear, focused, easy to update -- [ ] **Full test coverage** - All public contracts have tests - ---- - -## What Makes a Good Test? - -### ✅ Good Tests - -- Test observable behavior: "File change detection returns SUCCESS for modified files" -- Test the contract: "404 status results in page deletion" -- Test integration points: "PipelineWorker correctly translates status codes to database operations" -- Use realistic scenarios: "Refresh with mix of 304, 200, and 404 responses" - -### ❌ Bad Tests - -- Test implementation details: "ETag is generated from mtime timestamp" -- Test internal state: "Queue contains exactly N items" -- Test trivial behavior: "Function returns the value it was given" -- Over-mock: Mocking every dependency makes tests fragile and meaningless - ---- - -## References - -- Existing E2E tests: `test/refresh-pipeline-e2e.test.ts` -- Refresh architecture: `docs/refresh-architecture.md` -- Strategy unit tests: `src/scraper/strategies/*.test.ts` -- Fetcher unit tests: `src/scraper/fetcher/*.test.ts` -- Store unit tests: `src/store/DocumentStore.test.ts` From e405d36bdb03cf6e3f47efecbfd2704a5fc4607c Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Tue, 11 Nov 2025 06:46:59 -0800 Subject: [PATCH 12/20] refactor(scraper): enhance file inclusion logic with MIME type checks refactor(store): improve error handling in findParentChunk method refactor(assembly): streamline parent chunk lookup process fix(tools): correct Etag to ETag capitalization in RefreshVersionTool documentation refactor(tests): remove unused DocumentStore import in refresh pipeline tests --- .../strategies/GitHubScraperStrategy.ts | 16 ++++- src/store/DocumentManagementService.ts | 1 - src/store/DocumentStore.ts | 15 +++-- .../HierarchicalAssemblyStrategy.ts | 58 +++++-------------- src/tools/RefreshVersionTool.ts | 2 +- test/refresh-pipeline-e2e.test.ts | 2 - 6 files changed, 37 insertions(+), 57 deletions(-) diff --git a/src/scraper/strategies/GitHubScraperStrategy.ts b/src/scraper/strategies/GitHubScraperStrategy.ts index 42a7438e..72589da5 100644 --- a/src/scraper/strategies/GitHubScraperStrategy.ts +++ b/src/scraper/strategies/GitHubScraperStrategy.ts @@ -1,3 +1,4 @@ +import mime from "mime"; import type { ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import { HttpFetcher } from "../fetcher"; @@ -297,11 +298,20 @@ export class GitHubScraperStrategy extends BaseScraperStrategy { return fileNameLower === name || fileNameLower.startsWith(`${name}.`); }); - if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) { - return false; + // If file passes known checks, include it + if (hasTextExtension || hasCompoundExtension || isCommonTextFile) { + return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns); + } + + // Fallback: check if unknown extension has text/* MIME type + const mimeType = mime.getType(path); + if (mimeType?.startsWith("text/")) { + logger.debug(`Including file with text MIME type: ${path} (${mimeType})`); + return 
shouldIncludeUrl(path, options.includePatterns, options.excludePatterns); } - return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns); + // Not a text file + return false; } /** diff --git a/src/store/DocumentManagementService.ts b/src/store/DocumentManagementService.ts index cfdfe671..7f660cd0 100644 --- a/src/store/DocumentManagementService.ts +++ b/src/store/DocumentManagementService.ts @@ -438,7 +438,6 @@ export class DocumentManagementService { // Document characteristics hasTitle: !!title, - // hasDescription: !!processed.metadata.description, urlDomain: extractHostname(url), depth, diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index 5493213c..a6c0e022 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -682,11 +682,11 @@ export class DocumentStore { // Extract source URL and exclude runtime-only fields using destructuring const { url: source_url, - library: _, - version: __, - signal: ___, - initialQueue: ____, - isRefresh: _____, + library: _library, + version: _version, + signal: _signal, + initialQueue: _initialQueue, + isRefresh: _isRefresh, ...scraper_options } = options; @@ -1463,6 +1463,8 @@ export class DocumentStore { /** * Finds the parent chunk of a given document. + * Returns null if no parent is found or if there's a database error. + * Database errors are logged but not thrown to maintain consistent behavior. */ async findParentChunk( library: string, @@ -1497,7 +1499,8 @@ export class DocumentStore { return this.parseMetadata(result); } catch (error) { - throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error); + logger.warn(`Failed to find parent chunk for ID ${id}: ${error}`); + return null; } } diff --git a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts index 0407361d..a2ed5013 100644 --- a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts +++ b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts @@ -220,47 +220,21 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { chainIds.push(currentId); depth++; - try { - // Try normal parent lookup first - const parentChunk = await documentStore.findParentChunk( + // Try normal parent lookup first + let parentChunk = await documentStore.findParentChunk(library, version, currentId); + + // If no direct parent found, try gap-aware ancestor search + if (!parentChunk) { + parentChunk = await this.findAncestorWithGaps( library, version, - currentId, + currentChunk.url, + currentChunk.metadata.path ?? [], + documentStore, ); - - if (parentChunk) { - currentChunk = parentChunk; - } else { - // If normal parent lookup fails, try to find ancestors with gaps - currentChunk = await this.findAncestorWithGaps( - library, - version, - currentChunk.url, - currentChunk.metadata.path ?? [], - documentStore, - ); - } - } catch (error) { - // If standard lookup fails, try gap-aware ancestor search - try { - if (currentChunk) { - currentChunk = await this.findAncestorWithGaps( - library, - version, - currentChunk.url, - currentChunk.metadata.path ?? [], - documentStore, - ); - } else { - currentChunk = null; - } - } catch (gapError) { - logger.warn( - `Parent lookup failed for chunk ${currentId}: ${error}. 
Gap search also failed: ${gapError}`, - ); - break; - } } + + currentChunk = parentChunk; } if (depth >= maxDepth) { @@ -612,13 +586,9 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { chunkIds.add(id); // Add parent for context - try { - const parent = await documentStore.findParentChunk(library, version, id); - if (parent) { - chunkIds.add(parent.id); - } - } catch (error) { - logger.warn(`Failed to find parent for chunk ${id}: ${error}`); + const parent = await documentStore.findParentChunk(library, version, id); + if (parent) { + chunkIds.add(parent.id); } // Add direct children (limited) diff --git a/src/tools/RefreshVersionTool.ts b/src/tools/RefreshVersionTool.ts index 14b73ac8..97140529 100644 --- a/src/tools/RefreshVersionTool.ts +++ b/src/tools/RefreshVersionTool.ts @@ -20,7 +20,7 @@ export type RefreshExecuteResult = RefreshResult | { jobId: string }; /** * Tool for refreshing an existing library version by re-scraping all pages - * and using Etag comparison to skip unchanged content. + * and using ETag comparison to skip unchanged content. */ export class RefreshVersionTool { private pipeline: IPipeline; diff --git a/test/refresh-pipeline-e2e.test.ts b/test/refresh-pipeline-e2e.test.ts index a42fd319..e0af9dca 100644 --- a/test/refresh-pipeline-e2e.test.ts +++ b/test/refresh-pipeline-e2e.test.ts @@ -17,7 +17,6 @@ import { PipelineManager } from "../src/pipeline/PipelineManager"; import { ScraperService } from "../src/scraper/ScraperService"; import type { ScraperOptions } from "../src/scraper/types"; import { DocumentManagementService } from "../src/store/DocumentManagementService"; -import { DocumentStore } from "../src/store/DocumentStore"; import type { StoreSearchResult } from "../src/store/types"; import { ScraperRegistry } from "../src/scraper"; @@ -99,7 +98,6 @@ describe("Refresh Pipeline E2E Tests", () => { const page2 = pages.find((p) => p.url === `${TEST_BASE_URL}/page2`); expect(page2).toBeDefined(); - const page2Id = page2!.id; // Setup: Mock refresh with page2 deleted (404) // Enable nock logging to see what requests are made From e8410f3a9410aa7bbda98c091b2cb5f55d1d0653 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Thu, 13 Nov 2025 16:00:54 -0600 Subject: [PATCH 13/20] feat(schema): add database schema comparison tool for migration validation --- package-lock.json | 2 +- package.json | 2 +- scripts/validate-schema.ts | 328 +++++++++++++++++++++++++++++++++++++ 3 files changed, 330 insertions(+), 2 deletions(-) create mode 100755 scripts/validate-schema.ts diff --git a/package-lock.json b/package-lock.json index 8184dbe5..a637c4c5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -91,7 +91,7 @@ "tailwindcss": "^4.1.4", "typescript": "^5.9.3", "vite": "^6.3.5", - "vite-node": "^3.1.2", + "vite-node": "^3.2.4", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" }, diff --git a/package.json b/package.json index 8c3fb58d..70e129db 100644 --- a/package.json +++ b/package.json @@ -119,7 +119,7 @@ "tailwindcss": "^4.1.4", "typescript": "^5.9.3", "vite": "^6.3.5", - "vite-node": "^3.1.2", + "vite-node": "^3.2.4", "vite-plugin-dts": "^4.5.4", "vitest": "^3.2.4" }, diff --git a/scripts/validate-schema.ts b/scripts/validate-schema.ts new file mode 100755 index 00000000..61ea9e48 --- /dev/null +++ b/scripts/validate-schema.ts @@ -0,0 +1,328 @@ +#!/usr/bin/env vite-node + +/** + * Database Schema Comparison Tool + * + * This script compares the database structure from a SQLite database file + * against an in-memory database created by 
applying all migrations. + * + * Usage: + * ./scripts/compare-schema.ts [--db ] + * + * If --db is not specified, uses the default database location logic. + */ + +import path from "node:path"; +import Database, { type Database as DatabaseType } from "better-sqlite3"; +import * as sqliteVec from "sqlite-vec"; +import { Command } from "commander"; +import { applyMigrations } from "../src/store/applyMigrations"; +import { resolveStorePath } from "../src/utils/paths"; + +// Schema structures +interface ColumnInfo { + cid: number; + name: string; + type: string; + notnull: number; + dflt_value: string | null; + pk: number; +} + +interface IndexInfo { + seq: number; + name: string; + unique: number; + origin: string; + partial: number; +} + +interface TableSchema { + name: string; + type: string; + sql: string | null; + columns: ColumnInfo[]; + indexes: IndexInfo[]; +} + +interface DatabaseSchema { + tables: Map; +} + +/** + * Extracts the complete schema from a database. + */ +function getSchemaDetails(db: DatabaseType): DatabaseSchema { + const schema: DatabaseSchema = { + tables: new Map(), + }; + + // Get all tables (including virtual tables) + const tables = db + .prepare( + "SELECT name, type, sql FROM sqlite_master WHERE type IN ('table', 'view') ORDER BY name", + ) + .all() as Array<{ name: string; type: string; sql: string | null }>; + + for (const table of tables) { + // Skip internal SQLite tables + if (table.name.startsWith("sqlite_")) { + continue; + } + + const tableSchema: TableSchema = { + name: table.name, + type: table.type, + sql: table.sql, + columns: [], + indexes: [], + }; + + // Get column information (won't work for virtual tables, but we'll handle that) + try { + const columns = db.prepare(`PRAGMA table_info(${table.name})`).all() as ColumnInfo[]; + tableSchema.columns = columns; + } catch (error) { + // Virtual tables don't support PRAGMA table_info, we'll just note this + console.log(`Note: Cannot get column info for virtual table: ${table.name}`); + } + + // Get index information + try { + const indexes = db.prepare(`PRAGMA index_list(${table.name})`).all() as IndexInfo[]; + tableSchema.indexes = indexes; + } catch (error) { + // Some virtual tables don't support PRAGMA index_list + console.log(`Note: Cannot get index info for table: ${table.name}`); + } + + schema.tables.set(table.name, tableSchema); + } + + return schema; +} + +/** + * Compares two database schemas and returns a report of differences. 
+ */ +function compareSchemas( + expectedSchema: DatabaseSchema, + actualSchema: DatabaseSchema, +): { isMatch: boolean; differences: string[] } { + const differences: string[] = []; + + // Check for missing or extra tables + const expectedTableNames = new Set(expectedSchema.tables.keys()); + const actualTableNames = new Set(actualSchema.tables.keys()); + + for (const tableName of expectedTableNames) { + if (!actualTableNames.has(tableName)) { + differences.push(`❌ Missing table: ${tableName}`); + } + } + + for (const tableName of actualTableNames) { + if (!expectedTableNames.has(tableName)) { + differences.push(`➕ Extra table: ${tableName}`); + } + } + + // Compare tables that exist in both schemas + for (const tableName of expectedTableNames) { + if (!actualTableNames.has(tableName)) { + continue; // Already reported as missing + } + + const expectedTable = expectedSchema.tables.get(tableName)!; + const actualTable = actualSchema.tables.get(tableName)!; + + // Compare table types + if (expectedTable.type !== actualTable.type) { + differences.push( + `⚠️ Table ${tableName}: type mismatch (expected: ${expectedTable.type}, actual: ${actualTable.type})`, + ); + } + + // Compare SQL definitions for virtual tables + if (expectedTable.type !== "table" && expectedTable.sql !== actualTable.sql) { + differences.push( + `⚠️ Table ${tableName}: SQL definition mismatch\n Expected: ${expectedTable.sql}\n Actual: ${actualTable.sql}`, + ); + } + + // Compare columns (only for regular tables) + if (expectedTable.columns.length > 0 || actualTable.columns.length > 0) { + const expectedColumns = new Map(expectedTable.columns.map((col) => [col.name, col])); + const actualColumns = new Map(actualTable.columns.map((col) => [col.name, col])); + + // Check for missing columns + for (const [colName, expectedCol] of expectedColumns) { + const actualCol = actualColumns.get(colName); + + if (!actualCol) { + differences.push( + `❌ Table ${tableName}: missing column '${colName}' (expected type: ${expectedCol.type})`, + ); + continue; + } + + // Detailed comparison of all column properties + const columnDiffs: string[] = []; + + // Compare column type (case-insensitive, as SQLite is flexible with type names) + if (expectedCol.type.toUpperCase() !== actualCol.type.toUpperCase()) { + columnDiffs.push( + `type mismatch (expected: '${expectedCol.type}', actual: '${actualCol.type}')`, + ); + } + + // Compare NOT NULL constraint + if (expectedCol.notnull !== actualCol.notnull) { + columnDiffs.push( + `NOT NULL mismatch (expected: ${expectedCol.notnull === 1 ? "NOT NULL" : "NULL"}, actual: ${actualCol.notnull === 1 ? "NOT NULL" : "NULL"})`, + ); + } + + // Compare PRIMARY KEY status + if (expectedCol.pk !== actualCol.pk) { + columnDiffs.push( + `PRIMARY KEY mismatch (expected: ${expectedCol.pk === 1 ? "YES" : "NO"}, actual: ${actualCol.pk === 1 ? "YES" : "NO"})`, + ); + } + + // Compare default values (normalize NULL comparisons) + const expectedDefault = expectedCol.dflt_value === null ? "NULL" : expectedCol.dflt_value; + const actualDefault = actualCol.dflt_value === null ? 
"NULL" : actualCol.dflt_value; + + if (expectedDefault !== actualDefault) { + columnDiffs.push( + `default value mismatch (expected: ${expectedDefault}, actual: ${actualDefault})`, + ); + } + + // Compare column order (cid) + if (expectedCol.cid !== actualCol.cid) { + columnDiffs.push( + `column order mismatch (expected position: ${expectedCol.cid}, actual position: ${actualCol.cid})`, + ); + } + + // Report all differences for this column + if (columnDiffs.length > 0) { + differences.push( + `⚠️ Table ${tableName}, column '${colName}':\n ${columnDiffs.join("\n ")}`, + ); + } + } + + // Check for extra columns + for (const colName of actualColumns.keys()) { + if (!expectedColumns.has(colName)) { + const actualCol = actualColumns.get(colName)!; + differences.push( + `➕ Table ${tableName}: extra column '${colName}' (type: ${actualCol.type}, NOT NULL: ${actualCol.notnull === 1 ? "YES" : "NO"})`, + ); + } + } + } + + // Compare indexes + const expectedIndexNames = new Set(expectedTable.indexes.map((idx) => idx.name)); + const actualIndexNames = new Set(actualTable.indexes.map((idx) => idx.name)); + + for (const indexName of expectedIndexNames) { + if (!actualIndexNames.has(indexName)) { + differences.push(`❌ Table ${tableName}: missing index ${indexName}`); + } + } + + for (const indexName of actualIndexNames) { + if (!expectedIndexNames.has(indexName)) { + differences.push(`➕ Table ${tableName}: extra index ${indexName}`); + } + } + } + + return { + isMatch: differences.length === 0, + differences, + }; +} + +/** + * Main function to orchestrate the schema comparison. + */ +async function main() { + const program = new Command(); + + program + .name("validate-schema") + .description("Validate SQLite database schema against migrations") + .option("--db ", "Path to the target documents.db file") + .parse(process.argv); + + const options = program.opts(); + + // Determine the target database path + let targetDbPath: string; + if (options.db) { + targetDbPath = path.resolve(options.db as string); + } else { + const storePath = resolveStorePath(); + targetDbPath = path.join(storePath, "documents.db"); + } + + console.log("🔍 Database Schema Comparison Tool"); + console.log("================================\n"); + console.log(`Target database: ${targetDbPath}\n`); + + // Create expected schema from migrations + console.log("📝 Creating expected schema from migrations..."); + const expectedDb = new Database(":memory:"); + sqliteVec.load(expectedDb); + + try { + await applyMigrations(expectedDb); + const expectedSchema = getSchemaDetails(expectedDb); + console.log(` Found ${expectedSchema.tables.size} tables in expected schema\n`); + + // Get actual schema from target database + console.log("📂 Reading actual schema from target database..."); + const actualDb = new Database(targetDbPath, { readonly: true }); + sqliteVec.load(actualDb); + + try { + const actualSchema = getSchemaDetails(actualDb); + console.log(` Found ${actualSchema.tables.size} tables in actual schema\n`); + + // Compare schemas + console.log("🔎 Comparing schemas...\n"); + const comparison = compareSchemas(expectedSchema, actualSchema); + + if (comparison.isMatch) { + console.log("✅ SUCCESS: Database schemas match perfectly!"); + console.log(" The database structure is exactly as expected.\n"); + process.exit(0); + } else { + console.log("❌ MISMATCH: Database schemas differ!\n"); + console.log("Differences found:\n"); + for (const diff of comparison.differences) { + console.log(` ${diff}`); + } + console.log(`\nTotal differences: 
${comparison.differences.length}\n`); + process.exit(1); + } + } finally { + actualDb.close(); + } + } finally { + expectedDb.close(); + } +} + +// Run the script +main().catch((error) => { + console.error("\n❌ Error:", error.message); + console.error(error.stack); + process.exit(1); +}); From 90fcf1fa0cee3e4785854d267af5c96c36aeebfa Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Thu, 13 Nov 2025 16:13:04 -0600 Subject: [PATCH 14/20] feat: enable source maps for better error reporting --- Dockerfile | 2 +- vite.config.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 42638816..341182b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,4 +69,4 @@ ENV PORT=6280 ENV HOST=0.0.0.0 # Set the command to run the application -ENTRYPOINT ["node", "dist/index.js"] +ENTRYPOINT ["node", "--enable-source-maps", "dist/index.js"] diff --git a/vite.config.ts b/vite.config.ts index ef6324a1..bfb63323 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -11,8 +11,8 @@ export default defineConfig({ generateBundle(options, bundle) { const indexBundle = bundle['index.js']; if (indexBundle && indexBundle.type === 'chunk' && indexBundle.code) { - // Add shebang to the beginning of the file - indexBundle.code = '#!/usr/bin/env node\n' + indexBundle.code; + // Add shebang to the beginning of the file with source maps enabled + indexBundle.code = '#!/usr/bin/env node --enable-source-maps\n' + indexBundle.code; } }, writeBundle(options) { From b915bcf2c37df0cc9429582a66e773eb9d0e7416 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Thu, 13 Nov 2025 17:03:43 -0600 Subject: [PATCH 15/20] feat: update contentType handling across pipelines to reflect transformed formats --- .../HtmlToMarkdownMiddleware.test.ts | 1 + .../middleware/HtmlToMarkdownMiddleware.ts | 3 ++ src/scraper/pipelines/HtmlPipeline.test.ts | 39 +++++++++++++++++++ src/scraper/pipelines/HtmlPipeline.ts | 1 + src/scraper/pipelines/JsonPipeline.ts | 1 + src/scraper/pipelines/MarkdownPipeline.ts | 1 + src/scraper/pipelines/SourceCodePipeline.ts | 1 + src/scraper/pipelines/TextPipeline.ts | 1 + src/scraper/pipelines/types.ts | 2 + src/scraper/strategies/WebScraperStrategy.ts | 2 +- 10 files changed, 51 insertions(+), 1 deletion(-) diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts index 391f0022..43f5bb1c 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts @@ -61,6 +61,7 @@ describe("HtmlToMarkdownMiddleware", () => { expect(context.content).toBe( "# Heading 1\n\nThis is a paragraph with **bold** and _italic_ text.\n\n- Item 1\n- Item 2\n\n[Link](http://link.com)", ); + expect(context.contentType).toBe("text/markdown"); expect(context.errors).toHaveLength(0); // No close needed diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts index 0b026d40..d52e9651 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.ts @@ -111,6 +111,9 @@ export class HtmlToMarkdownMiddleware implements ContentProcessorMiddleware { context.content = markdown; logger.debug(`Successfully converted HTML to Markdown for ${context.source}`); } + + // Update contentType to reflect the converted format + context.contentType = "text/markdown"; } catch (error) { logger.error( `❌ Error converting HTML to Markdown for ${context.source}: ${error}`, diff --git 
a/src/scraper/pipelines/HtmlPipeline.test.ts b/src/scraper/pipelines/HtmlPipeline.test.ts index e8ee3adf..cd009d1b 100644 --- a/src/scraper/pipelines/HtmlPipeline.test.ts +++ b/src/scraper/pipelines/HtmlPipeline.test.ts @@ -271,6 +271,9 @@ describe("HtmlPipeline", () => { expect(result.title).toBe("Test Page"); expect(result.links).toContain("https://example.com/test/link"); + // Verify contentType was updated to markdown after HTML conversion + expect(result.contentType).toBe("text/markdown"); + // Verify the content was sanitized (no script tags) and converted to markdown expect(result.textContent).not.toContain("alert"); expect(result.textContent).toContain("Hello World"); @@ -280,6 +283,42 @@ describe("HtmlPipeline", () => { expect(result.errors).toHaveLength(0); }); + it("should convert contentType from text/html to text/markdown", async () => { + const pipeline = new HtmlPipeline(); + const html = ` + + Mimetype Test + +

<h1>Testing Content Type</h1> + <p>This HTML should be converted to markdown.</p>

+ + + `; + + const raw: RawContent = { + content: html, + mimeType: "text/html", + charset: "utf-8", + source: "http://test.example.com", + status: FetchStatus.SUCCESS, + }; + + const result = await pipeline.process(raw, { + url: "http://example.com", + library: "example", + version: "", + scrapeMode: ScrapeMode.Fetch, + }); + + // Verify contentType was transformed from HTML to Markdown + expect(result.contentType).toBe("text/markdown"); + expect(result.contentType).not.toBe("text/html"); + + // Verify content is in markdown format + expect(result.textContent).toContain("# Testing Content Type"); + expect(result.textContent).toContain("This HTML should be converted to markdown"); + }); + describe("cleanup", () => { it("should call closeBrowser on Playwright middleware when close() is called", async () => { const pipeline = new HtmlPipeline(); diff --git a/src/scraper/pipelines/HtmlPipeline.ts b/src/scraper/pipelines/HtmlPipeline.ts index 9deb9ee9..5d35e5ef 100644 --- a/src/scraper/pipelines/HtmlPipeline.ts +++ b/src/scraper/pipelines/HtmlPipeline.ts @@ -101,6 +101,7 @@ export class HtmlPipeline extends BasePipeline { return { title: context.title, + contentType: context.contentType, textContent: context.content, links: context.links, errors: context.errors, diff --git a/src/scraper/pipelines/JsonPipeline.ts b/src/scraper/pipelines/JsonPipeline.ts index c3abe6a0..26d33e03 100644 --- a/src/scraper/pipelines/JsonPipeline.ts +++ b/src/scraper/pipelines/JsonPipeline.ts @@ -89,6 +89,7 @@ export class JsonPipeline extends BasePipeline { return { title: context.title, + contentType: context.contentType, textContent: context.content, links: context.links, errors: context.errors, diff --git a/src/scraper/pipelines/MarkdownPipeline.ts b/src/scraper/pipelines/MarkdownPipeline.ts index e3302392..7873a083 100644 --- a/src/scraper/pipelines/MarkdownPipeline.ts +++ b/src/scraper/pipelines/MarkdownPipeline.ts @@ -78,6 +78,7 @@ export class MarkdownPipeline extends BasePipeline { return { title: context.title, + contentType: context.contentType, textContent: typeof context.content === "string" ? 
context.content : "", links: context.links, errors: context.errors, diff --git a/src/scraper/pipelines/SourceCodePipeline.ts b/src/scraper/pipelines/SourceCodePipeline.ts index 8bf892b4..0b8dd07f 100644 --- a/src/scraper/pipelines/SourceCodePipeline.ts +++ b/src/scraper/pipelines/SourceCodePipeline.ts @@ -64,6 +64,7 @@ export class SourceCodePipeline extends BasePipeline { return { title: context.title, + contentType: context.contentType, textContent: context.content, // metadata: context.metadata, links: context.links, diff --git a/src/scraper/pipelines/TextPipeline.ts b/src/scraper/pipelines/TextPipeline.ts index 9591f41a..f8a294f7 100644 --- a/src/scraper/pipelines/TextPipeline.ts +++ b/src/scraper/pipelines/TextPipeline.ts @@ -75,6 +75,7 @@ export class TextPipeline extends BasePipeline { return { title: context.title, + contentType: context.contentType, textContent: context.content, links: context.links, errors: context.errors, diff --git a/src/scraper/pipelines/types.ts b/src/scraper/pipelines/types.ts index 39f67a90..affcf59c 100644 --- a/src/scraper/pipelines/types.ts +++ b/src/scraper/pipelines/types.ts @@ -8,6 +8,8 @@ import type { ScraperOptions } from "../types"; export interface PipelineResult { /** The title of the page or document, extracted during processing */ title?: string | null; + /** The MIME type of the processed content (may differ from input if transformed, e.g., HTML → Markdown) */ + contentType?: string | null; /** The final processed content, typically as a string (e.g., Markdown). */ textContent?: string | null; /** Extracted links from the content. */ diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index 702650d6..6a55d2a5 100644 --- a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -143,7 +143,7 @@ export class WebScraperStrategy extends BaseScraperStrategy { url: rawContent.source, etag: rawContent.etag, lastModified: rawContent.lastModified, - contentType: rawContent.mimeType, + contentType: processed.contentType || rawContent.mimeType, content: processed, links: filteredLinks, status: FetchStatus.SUCCESS, From 45d6d8a3721ca4f821e3063cf4f0b50b0f30bb37 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Thu, 13 Nov 2025 18:05:11 -0600 Subject: [PATCH 16/20] test: refactor tests to centralize logger mocking in setup file - Removed individual logger mocks from various test files. - Created a new setup file to mock the logger globally for all tests. - Updated vitest configuration to include the setup file for consistent logger behavior across tests. 
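For context, a minimal sketch of what the centralized logger mock could look like — the actual contents of `test/setup.ts` and the exact vitest config keys are assumptions here, not shown in this patch:

```ts
// test/setup.ts — hypothetical sketch: auto-mock the shared logger for every test file,
// mirroring the per-file vi.mock("../utils/logger") calls this commit removes.
import { vi } from "vitest";

vi.mock("../src/utils/logger");
```

The setup file only takes effect if it is registered in the vitest configuration, which is presumably what the one-line changes to `vite.config.ts` and `test/vitest.config.ts` do:

```ts
// Assumed vitest config change: run the setup file before each test file.
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    setupFiles: ["./test/setup.ts"],
  },
});
```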
--- src/app/AppServer.test.ts | 73 +++++++------------ src/cli/index.test.ts | 1 - src/index.test.ts | 11 --- src/pipeline/PipelineClient.test.ts | 2 - src/pipeline/PipelineFactory.test.ts | 1 - src/pipeline/PipelineManager.test.ts | 1 - src/pipeline/PipelineWorker.test.ts | 1 - src/scraper/ScraperRegistry.test.ts | 2 - src/scraper/ScraperService.test.ts | 2 - src/scraper/fetcher/FileFetcher.test.ts | 1 - src/scraper/fetcher/HttpFetcher.test.ts | 1 - .../HtmlCheerioParserMiddleware.test.ts | 1 - .../HtmlJsExecutorMiddleware.test.ts | 3 - .../HtmlLinkExtractorMiddleware.test.ts | 1 - .../HtmlMetadataExtractorMiddleware.test.ts | 1 - .../HtmlPlaywrightMiddleware.test.ts | 1 - .../HtmlSanitizerMiddleware.test.ts | 1 - .../HtmlToMarkdownMiddleware.test.ts | 1 - .../MarkdownLinkExtractorMiddleware.test.ts | 1 - ...arkdownMetadataExtractorMiddleware.test.ts | 1 - .../pipelines/HtmlPipeline.charset.test.ts | 12 +-- .../strategies/BaseScraperStrategy.test.ts | 1 - .../strategies/LocalFileStrategy.test.ts | 1 - .../strategies/WebScraperStrategy.test.ts | 1 - src/scraper/utils/sandbox.test.ts | 9 --- src/splitter/GreedySplitter.test.ts | 2 - src/splitter/SemanticMarkdownSplitter.test.ts | 4 +- .../splitters/CodeContentSplitter.test.ts | 4 +- .../splitters/TableContentSplitter.test.ts | 4 +- .../splitters/TextContentSplitter.test.ts | 4 +- src/store/DocumentManagementService.test.ts | 1 - src/store/DocumentRetrieverService.test.ts | 1 - src/store/embeddings/EmbeddingFactory.test.ts | 1 - .../FixedDimensionEmbeddings.test.ts | 3 +- src/telemetry/analytics.test.ts | 7 -- src/tools/CancelJobTool.test.ts | 1 - src/tools/ClearCompletedJobsTool.test.ts | 1 - src/tools/FetchUrlTool.test.ts | 1 - src/tools/FindVersionTool.test.ts | 1 - src/tools/GetJobInfoTool.test.ts | 1 - src/tools/ListJobsTool.test.ts | 1 - src/tools/ListLibrariesTool.test.ts | 1 - src/tools/RemoveTool.test.ts | 1 - src/tools/ScrapeTool.test.ts | 1 - src/tools/SearchTool.test.ts | 1 - src/utils/url.test.ts | 4 +- test/setup.ts | 3 + test/vitest.config.ts | 1 + vite.config.ts | 1 + 49 files changed, 38 insertions(+), 143 deletions(-) create mode 100644 test/setup.ts diff --git a/src/app/AppServer.test.ts b/src/app/AppServer.test.ts index d7626383..79e751e9 100644 --- a/src/app/AppServer.test.ts +++ b/src/app/AppServer.test.ts @@ -36,13 +36,6 @@ const mockWorkerService = vi.hoisted(() => ({ stopWorkerService: vi.fn(), })); -const mockLogger = vi.hoisted(() => ({ - info: vi.fn(), - warn: vi.fn(), - error: vi.fn(), - debug: vi.fn(), -})); - // Apply mocks using hoisted values vi.mock("fastify", () => ({ default: vi.fn(() => mockFastify), @@ -52,7 +45,6 @@ vi.mock("../services/mcpService", () => mockMcpService); vi.mock("../services/trpcService", () => mockTrpcService); vi.mock("../services/webService", () => mockWebService); vi.mock("../services/workerService", () => mockWorkerService); -vi.mock("../utils/logger", () => ({ logger: mockLogger })); vi.mock("../utils/paths", () => ({ getProjectRoot: vi.fn(() => "/mock/project/root"), })); @@ -399,7 +391,7 @@ describe("AppServer Behavior Tests", () => { expect(fastifyInstance).toBe(mockFastify); }); - it("should log startup information with enabled services", async () => { + it("should successfully start server with all services enabled", async () => { const config: AppServerConfig = { enableWebInterface: true, enableMcpServer: true, @@ -415,24 +407,23 @@ describe("AppServer Behavior Tests", () => { config, ); - await server.start(); + const fastifyInstance = await server.start(); - 
expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("AppServer available at"), - ); - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("Web interface:"), - ); - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("MCP endpoints:"), - ); - expect(mockLogger.info).toHaveBeenCalledWith(expect.stringContaining("API:")); - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("Embedded worker:"), - ); + // Verify server started successfully + expect(fastifyInstance).toBe(mockFastify); + expect(mockFastify.listen).toHaveBeenCalledWith({ + port: 3000, + host: "127.0.0.1", + }); + + // Verify all services were registered + expect(mockWebService.registerWebService).toHaveBeenCalled(); + expect(mockMcpService.registerMcpService).toHaveBeenCalled(); + expect(mockTrpcService.registerTrpcService).toHaveBeenCalled(); + expect(mockWorkerService.registerWorkerService).toHaveBeenCalled(); }); - it("should log external worker URL when configured", async () => { + it("should successfully start server with external worker configured", async () => { const config: AppServerConfig = { enableWebInterface: true, enableMcpServer: false, @@ -449,11 +440,15 @@ describe("AppServer Behavior Tests", () => { config, ); - await server.start(); + const fastifyInstance = await server.start(); - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("External worker: http://external-worker:8080"), - ); + // Verify server started successfully + expect(fastifyInstance).toBe(mockFastify); + expect(mockFastify.listen).toHaveBeenCalled(); + + // Verify web service was registered but not embedded worker + expect(mockWebService.registerWebService).toHaveBeenCalled(); + expect(mockWorkerService.registerWorkerService).not.toHaveBeenCalled(); }); it("should handle server startup failure gracefully", async () => { @@ -476,10 +471,8 @@ describe("AppServer Behavior Tests", () => { ); await expect(server.start()).rejects.toThrow("Port already in use"); + // Verify that cleanup was attempted expect(mockFastify.close).toHaveBeenCalled(); - expect(mockLogger.error).toHaveBeenCalledWith( - expect.stringContaining("Failed to start AppServer"), - ); }); }); @@ -506,9 +499,6 @@ describe("AppServer Behavior Tests", () => { expect(mockWorkerService.stopWorkerService).toHaveBeenCalledWith(mockPipeline); expect(mockMcpService.cleanupMcpService).toHaveBeenCalledWith(mockMcpServer); expect(mockFastify.close).toHaveBeenCalled(); - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("AppServer stopped"), - ); }); it("should not stop worker service when not enabled", async () => { @@ -557,12 +547,8 @@ describe("AppServer Behavior Tests", () => { await server.start(); await expect(server.stop()).rejects.toThrow("Cleanup failed"); + // Verify that stopWorkerService was called before the error was thrown expect(mockWorkerService.stopWorkerService).toHaveBeenCalledWith(mockPipeline); - // Since the error is thrown immediately when stopWorkerService fails, - // the other cleanup operations won't be called - expect(mockLogger.error).toHaveBeenCalledWith( - expect.stringContaining("Failed to stop AppServer gracefully"), - ); }); }); @@ -591,7 +577,6 @@ describe("AppServer Behavior Tests", () => { mockPipeline, mockDocService, ); - expect(mockLogger.info).toHaveBeenCalledWith(expect.stringContaining("API:")); }); it("should handle configuration with both embedded and external worker", async () => { @@ -613,14 +598,8 @@ describe("AppServer 
Behavior Tests", () => { await server.start(); - // Embedded worker should take precedence + // Embedded worker should take precedence - external worker should not be registered expect(mockWorkerService.registerWorkerService).toHaveBeenCalledWith(mockPipeline); - expect(mockLogger.info).toHaveBeenCalledWith( - expect.stringContaining("Embedded worker: enabled"), - ); - expect(mockLogger.info).not.toHaveBeenCalledWith( - expect.stringContaining("External worker:"), - ); }); it("should validate port number boundaries", async () => { diff --git a/src/cli/index.test.ts b/src/cli/index.test.ts index c375bddf..0785081e 100644 --- a/src/cli/index.test.ts +++ b/src/cli/index.test.ts @@ -13,7 +13,6 @@ import { } from "./utils"; // Mocks for execution tests will be defined below in dedicated describe block -vi.mock("../utils/logger"); // --- Additional mocks for createPipelineWithCallbacks behavior tests --- vi.mock("../pipeline/PipelineFactory", () => ({ diff --git a/src/index.test.ts b/src/index.test.ts index d4b30b6c..50d12506 100644 --- a/src/index.test.ts +++ b/src/index.test.ts @@ -54,17 +54,6 @@ vi.mock("./mcp/tools", () => ({ initializeTools: vi.fn().mockResolvedValue({}), })); -vi.mock("./utils/logger", () => ({ - logger: { - info: vi.fn(), - debug: vi.fn(), - warn: vi.fn(), - error: vi.fn(), - }, - setLogLevel: vi.fn(), - LogLevel: { ERROR: 0, WARN: 1, INFO: 2, DEBUG: 3 }, -})); - vi.mock("playwright", () => ({ chromium: { executablePath: vi.fn().mockReturnValue("/mock/chromium") }, })); diff --git a/src/pipeline/PipelineClient.test.ts b/src/pipeline/PipelineClient.test.ts index 668ffda2..657869ac 100644 --- a/src/pipeline/PipelineClient.test.ts +++ b/src/pipeline/PipelineClient.test.ts @@ -1,8 +1,6 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; import { PipelineClient } from "./PipelineClient"; -vi.mock("../utils/logger"); - // Mock tRPC client factory const mockClient: any = { ping: { query: vi.fn() }, diff --git a/src/pipeline/PipelineFactory.test.ts b/src/pipeline/PipelineFactory.test.ts index 7b40ade1..0f37caa8 100644 --- a/src/pipeline/PipelineFactory.test.ts +++ b/src/pipeline/PipelineFactory.test.ts @@ -7,7 +7,6 @@ import { PipelineManager } from "./PipelineManager"; // Mock dependencies vi.mock("./PipelineManager"); vi.mock("./PipelineClient"); -vi.mock("../utils/logger"); describe("PipelineFactory", () => { let mockDocService: Partial; diff --git a/src/pipeline/PipelineManager.test.ts b/src/pipeline/PipelineManager.test.ts index 25849a9c..ae5113a6 100644 --- a/src/pipeline/PipelineManager.test.ts +++ b/src/pipeline/PipelineManager.test.ts @@ -27,7 +27,6 @@ import { PipelineJobStatus } from "./types"; vi.mock("../store/DocumentManagementService"); vi.mock("../scraper/ScraperService"); vi.mock("./PipelineWorker"); -vi.mock("../utils/logger"); describe("PipelineManager", () => { let mockStore: Partial; diff --git a/src/pipeline/PipelineWorker.test.ts b/src/pipeline/PipelineWorker.test.ts index d1b05ac1..15e24fdc 100644 --- a/src/pipeline/PipelineWorker.test.ts +++ b/src/pipeline/PipelineWorker.test.ts @@ -9,7 +9,6 @@ import { PipelineJobStatus } from "./types"; // Mock dependencies vi.mock("../store/DocumentManagementService"); vi.mock("../scraper/ScraperService"); -vi.mock("../utils/logger"); describe("PipelineWorker", () => { let mockStore: Partial; diff --git a/src/scraper/ScraperRegistry.test.ts b/src/scraper/ScraperRegistry.test.ts index e91fe557..cebf6378 100644 --- a/src/scraper/ScraperRegistry.test.ts +++ b/src/scraper/ScraperRegistry.test.ts @@ -6,8 +6,6 
@@ import { LocalFileStrategy } from "./strategies/LocalFileStrategy"; import { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; import { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; -vi.mock("../utils/logger"); - describe("ScraperRegistry", () => { it("should throw error for unknown URLs", () => { const registry = new ScraperRegistry(); diff --git a/src/scraper/ScraperService.test.ts b/src/scraper/ScraperService.test.ts index 8faa842b..2f6e89d0 100644 --- a/src/scraper/ScraperService.test.ts +++ b/src/scraper/ScraperService.test.ts @@ -5,8 +5,6 @@ import type { ScraperRegistry } from "./ScraperRegistry"; import { ScraperService } from "./ScraperService"; import type { ScraperOptions, ScraperProgressEvent } from "./types"; -vi.mock("../utils/logger"); - describe("ScraperService", () => { // Mock registry const mockRegistry = { diff --git a/src/scraper/fetcher/FileFetcher.test.ts b/src/scraper/fetcher/FileFetcher.test.ts index 7a96497c..49052bfa 100644 --- a/src/scraper/fetcher/FileFetcher.test.ts +++ b/src/scraper/fetcher/FileFetcher.test.ts @@ -4,7 +4,6 @@ import { ScraperError } from "../../utils/errors"; import { FileFetcher } from "./FileFetcher"; vi.mock("node:fs/promises", () => ({ default: vol.promises })); -vi.mock("../../utils/logger"); describe("FileFetcher", () => { beforeEach(() => { diff --git a/src/scraper/fetcher/HttpFetcher.test.ts b/src/scraper/fetcher/HttpFetcher.test.ts index 3330c7c0..23ae38f2 100644 --- a/src/scraper/fetcher/HttpFetcher.test.ts +++ b/src/scraper/fetcher/HttpFetcher.test.ts @@ -3,7 +3,6 @@ import { CancellationError } from "../../pipeline/errors"; import { RedirectError, ScraperError } from "../../utils/errors"; vi.mock("axios"); -vi.mock("../../utils/logger"); import axios from "axios"; diff --git a/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts b/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts index 67fcae0c..577dff66 100644 --- a/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts +++ b/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts @@ -38,7 +38,6 @@ vi.mock("cheerio", () => { }); // Suppress logger output during tests -vi.mock("../../utils/logger"); // Import cheerio after mocking import * as cheerio from "cheerio"; diff --git a/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts b/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts index f8f7a583..a56db3a2 100644 --- a/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts +++ b/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts @@ -13,9 +13,6 @@ import { executeJsInSandbox } from "../utils/sandbox"; import { HtmlJsExecutorMiddleware } from "./HtmlJsExecutorMiddleware"; import type { MiddlewareContext } from "./types"; -// Mock the logger -vi.mock("../../../utils/logger"); - // Mock the sandbox utility vi.mock("../utils/sandbox"); diff --git a/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts b/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts index 6ad2609c..20fc6e21 100644 --- a/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts +++ b/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts @@ -6,7 +6,6 @@ import { HtmlLinkExtractorMiddleware } from "./HtmlLinkExtractorMiddleware"; import type { MiddlewareContext } from "./types"; // Suppress logger output during tests -vi.mock("../../../utils/logger"); // Helper to create a minimal valid ScraperOptions object const createMockScraperOptions = (url = "http://example.com"): ScraperOptions => ({ diff --git 
a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts index 5b567f22..567e4d22 100644 --- a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts +++ b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts @@ -6,7 +6,6 @@ import { HtmlMetadataExtractorMiddleware } from "./HtmlMetadataExtractorMiddlewa import type { MiddlewareContext } from "./types"; // Suppress logger output during tests -vi.mock("../../../utils/logger"); // Helper to create a minimal valid ScraperOptions object const createMockScraperOptions = (url = "http://example.com"): ScraperOptions => ({ diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts index fd126b20..c76fa406 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts @@ -17,7 +17,6 @@ import { import type { MiddlewareContext } from "./types"; // Suppress logger output during tests -vi.mock("../../../utils/logger"); // Mock playwright using factory functions vi.mock("playwright", async (importOriginal) => diff --git a/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts b/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts index e38116e5..ab1e534c 100644 --- a/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts +++ b/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts @@ -6,7 +6,6 @@ import { HtmlSanitizerMiddleware } from "./HtmlSanitizerMiddleware"; import type { MiddlewareContext } from "./types"; // Suppress logger output during tests -vi.mock("../../../utils/logger"); // Helper to create a minimal valid ScraperOptions object const createMockScraperOptions = ( diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts index 43f5bb1c..ad52a29b 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts @@ -7,7 +7,6 @@ import { HtmlToMarkdownMiddleware } from "./HtmlToMarkdownMiddleware"; import type { MiddlewareContext } from "./types"; // Suppress logger output during tests -vi.mock("../../../utils/logger"); // Helper to create a minimal valid ScraperOptions object const createMockScraperOptions = (url = "http://example.com"): ScraperOptions => ({ diff --git a/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts b/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts index 33b485d9..0bcc1aea 100644 --- a/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts +++ b/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts @@ -4,7 +4,6 @@ import { MarkdownLinkExtractorMiddleware } from "./MarkdownLinkExtractorMiddlewa import type { MiddlewareContext } from "./types"; // Suppress logger output during tests -vi.mock("../../utils/logger"); // Helper to create a minimal valid ScraperOptions object const createMockScraperOptions = (url = "http://example.com"): ScraperOptions => ({ diff --git a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts index 1e4fc8e7..d97d6477 100644 --- a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts +++ b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts @@ -4,7 +4,6 @@ import { MarkdownMetadataExtractorMiddleware } from "./MarkdownMetadataExtractor import type { MiddlewareContext } from "./types"; // 
Suppress logger output during tests -vi.mock("../../utils/logger"); // Helper to create a minimal valid ScraperOptions object const createMockScraperOptions = (url = "http://example.com"): ScraperOptions => ({ diff --git a/src/scraper/pipelines/HtmlPipeline.charset.test.ts b/src/scraper/pipelines/HtmlPipeline.charset.test.ts index 701881ec..cf05c97d 100644 --- a/src/scraper/pipelines/HtmlPipeline.charset.test.ts +++ b/src/scraper/pipelines/HtmlPipeline.charset.test.ts @@ -1,18 +1,8 @@ -import { beforeEach, describe, expect, it, vi } from "vitest"; +import { beforeEach, describe, expect, it } from "vitest"; import { FetchStatus, type RawContent } from "../fetcher/types"; import { ScrapeMode } from "../types"; import { HtmlPipeline } from "./HtmlPipeline"; -// Mock logger -vi.mock("../../utils/logger", () => ({ - logger: { - debug: vi.fn(), - info: vi.fn(), - warn: vi.fn(), - error: vi.fn(), - }, -})); - describe("HtmlPipeline charset integration", () => { let pipeline: HtmlPipeline; diff --git a/src/scraper/strategies/BaseScraperStrategy.test.ts b/src/scraper/strategies/BaseScraperStrategy.test.ts index 884c98fb..c9dd90ce 100644 --- a/src/scraper/strategies/BaseScraperStrategy.test.ts +++ b/src/scraper/strategies/BaseScraperStrategy.test.ts @@ -5,7 +5,6 @@ import type { QueueItem, ScraperOptions, ScraperProgressEvent } from "../types"; import { BaseScraperStrategy } from "./BaseScraperStrategy"; // Mock logger -vi.mock("../../utils/logger"); // Mock implementation for testing abstract class class TestScraperStrategy extends BaseScraperStrategy { diff --git a/src/scraper/strategies/LocalFileStrategy.test.ts b/src/scraper/strategies/LocalFileStrategy.test.ts index dda0b5d6..8a3780fb 100644 --- a/src/scraper/strategies/LocalFileStrategy.test.ts +++ b/src/scraper/strategies/LocalFileStrategy.test.ts @@ -5,7 +5,6 @@ import type { ScrapeResult, ScraperOptions, ScraperProgressEvent } from "../type import { LocalFileStrategy } from "./LocalFileStrategy"; vi.mock("node:fs/promises", () => ({ default: vol.promises })); -vi.mock("../../utils/logger"); vi.mock("node:fs"); describe("LocalFileStrategy", () => { diff --git a/src/scraper/strategies/WebScraperStrategy.test.ts b/src/scraper/strategies/WebScraperStrategy.test.ts index 44461660..0a3becae 100644 --- a/src/scraper/strategies/WebScraperStrategy.test.ts +++ b/src/scraper/strategies/WebScraperStrategy.test.ts @@ -6,7 +6,6 @@ import { ScrapeMode } from "../types"; // Import ScrapeMode import { WebScraperStrategy } from "./WebScraperStrategy"; // Mock dependencies -vi.mock("../../utils/logger"); // Mock HttpFetcher module with a factory vi.mock("../fetcher/HttpFetcher", async (importActual) => { diff --git a/src/scraper/utils/sandbox.test.ts b/src/scraper/utils/sandbox.test.ts index c9e92636..354305e2 100644 --- a/src/scraper/utils/sandbox.test.ts +++ b/src/scraper/utils/sandbox.test.ts @@ -3,15 +3,6 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; import { logger } from "../../utils/logger"; import { executeJsInSandbox } from "./sandbox"; -// Mock the logger -vi.mock("../../utils/logger", () => ({ - logger: { - debug: vi.fn(), - warn: vi.fn(), - error: vi.fn(), - }, -})); - // Mock the JSDOM module vi.mock("jsdom"); diff --git a/src/splitter/GreedySplitter.test.ts b/src/splitter/GreedySplitter.test.ts index 8ef7e19c..e5c788c2 100644 --- a/src/splitter/GreedySplitter.test.ts +++ b/src/splitter/GreedySplitter.test.ts @@ -3,8 +3,6 @@ import { GreedySplitter } from "./GreedySplitter"; import { SemanticMarkdownSplitter } from 
"./SemanticMarkdownSplitter"; import type { Chunk } from "./types"; -vi.mock("../utils/logger"); - // Mock SemanticMarkdownSplitter const createMockSemanticSplitter = (chunks: Chunk[]) => { const mockSplitText = vi.fn().mockResolvedValue(chunks); diff --git a/src/splitter/SemanticMarkdownSplitter.test.ts b/src/splitter/SemanticMarkdownSplitter.test.ts index 7f0527d5..a281eab6 100644 --- a/src/splitter/SemanticMarkdownSplitter.test.ts +++ b/src/splitter/SemanticMarkdownSplitter.test.ts @@ -1,8 +1,6 @@ -import { describe, expect, it, vi } from "vitest"; +import { describe, expect, it } from "vitest"; import { SemanticMarkdownSplitter } from "./SemanticMarkdownSplitter"; -vi.mock("../utils/logger"); - describe("SemanticMarkdownSplitter", () => { it("should handle empty markdown", async () => { const splitter = new SemanticMarkdownSplitter(100, 5000); diff --git a/src/splitter/splitters/CodeContentSplitter.test.ts b/src/splitter/splitters/CodeContentSplitter.test.ts index e7ee4bf1..e851c011 100644 --- a/src/splitter/splitters/CodeContentSplitter.test.ts +++ b/src/splitter/splitters/CodeContentSplitter.test.ts @@ -1,9 +1,7 @@ -import { describe, expect, it, vi } from "vitest"; +import { describe, expect, it } from "vitest"; import { CodeContentSplitter } from "./CodeContentSplitter"; import type { ContentSplitterOptions } from "./types"; -vi.mock("../../utils/logger"); - describe("CodeContentSplitter", () => { const options = { chunkSize: 100, diff --git a/src/splitter/splitters/TableContentSplitter.test.ts b/src/splitter/splitters/TableContentSplitter.test.ts index d7adfb61..ec181ff0 100644 --- a/src/splitter/splitters/TableContentSplitter.test.ts +++ b/src/splitter/splitters/TableContentSplitter.test.ts @@ -1,10 +1,8 @@ -import { describe, expect, it, vi } from "vitest"; +import { describe, expect, it } from "vitest"; import { MinimumChunkSizeError } from "../errors"; import { TableContentSplitter } from "./TableContentSplitter"; import type { ContentSplitterOptions } from "./types"; -vi.mock("../../utils/logger"); - describe("TableContentSplitter", () => { const options = { chunkSize: 100, diff --git a/src/splitter/splitters/TextContentSplitter.test.ts b/src/splitter/splitters/TextContentSplitter.test.ts index 6955efe6..9af1c1b1 100644 --- a/src/splitter/splitters/TextContentSplitter.test.ts +++ b/src/splitter/splitters/TextContentSplitter.test.ts @@ -1,9 +1,7 @@ -import { describe, expect, it, vi } from "vitest"; +import { describe, expect, it } from "vitest"; import { TextContentSplitter } from "./TextContentSplitter"; import type { ContentSplitterOptions } from "./types"; -vi.mock("../../utils/logger"); - describe("TextContentSplitter", () => { const options = { chunkSize: 100, diff --git a/src/store/DocumentManagementService.test.ts b/src/store/DocumentManagementService.test.ts index 881be474..9628b39a 100644 --- a/src/store/DocumentManagementService.test.ts +++ b/src/store/DocumentManagementService.test.ts @@ -7,7 +7,6 @@ vi.mock("node:fs", () => ({ default: createFsFromVolume(vol), existsSync: vi.fn(vol.existsSync), })); -vi.mock("../utils/logger"); vi.mock("../utils/paths", () => ({ getProjectRoot: vi.fn(() => "/docs-mcp-server"), })); diff --git a/src/store/DocumentRetrieverService.test.ts b/src/store/DocumentRetrieverService.test.ts index 8b94e7e0..fdb0b724 100644 --- a/src/store/DocumentRetrieverService.test.ts +++ b/src/store/DocumentRetrieverService.test.ts @@ -4,7 +4,6 @@ import { DocumentStore } from "./DocumentStore"; import type { DbChunkRank, DbPageChunk } from "./types"; 
vi.mock("./DocumentStore"); -vi.mock("../utils/logger"); describe("DocumentRetrieverService (consolidated logic)", () => { let retrieverService: DocumentRetrieverService; diff --git a/src/store/embeddings/EmbeddingFactory.test.ts b/src/store/embeddings/EmbeddingFactory.test.ts index f6bc7e87..3141ccc6 100644 --- a/src/store/embeddings/EmbeddingFactory.test.ts +++ b/src/store/embeddings/EmbeddingFactory.test.ts @@ -8,7 +8,6 @@ import { createEmbeddingModel, UnsupportedProviderError } from "./EmbeddingFacto import { FixedDimensionEmbeddings } from "./FixedDimensionEmbeddings"; // Suppress logger output during tests -vi.mock("../../utils/logger"); // Mock process.env for each test const originalEnv = process.env; diff --git a/src/store/embeddings/FixedDimensionEmbeddings.test.ts b/src/store/embeddings/FixedDimensionEmbeddings.test.ts index 4862e8b6..b174a78d 100644 --- a/src/store/embeddings/FixedDimensionEmbeddings.test.ts +++ b/src/store/embeddings/FixedDimensionEmbeddings.test.ts @@ -1,11 +1,10 @@ import { Embeddings } from "@langchain/core/embeddings"; -import { describe, expect, test, vi } from "vitest"; +import { describe, expect, test } from "vitest"; import { DimensionError } from "../errors"; import { VECTOR_DIMENSION } from "../types"; import { FixedDimensionEmbeddings } from "./FixedDimensionEmbeddings"; // Suppress logger output during tests -vi.mock("../../utils/logger"); // Mock embedding models that produce vectors of different sizes class MockBaseEmbeddings extends Embeddings { diff --git a/src/telemetry/analytics.test.ts b/src/telemetry/analytics.test.ts index 0fb27816..2e84e950 100644 --- a/src/telemetry/analytics.test.ts +++ b/src/telemetry/analytics.test.ts @@ -15,13 +15,6 @@ vi.mock("./TelemetryConfig", () => ({ generateInstallationId: vi.fn(() => "test-installation-id"), })); -// Mock the logger -vi.mock("../utils/logger", () => ({ - logger: { - debug: vi.fn(), - }, -})); - // Mock PostHogClient vi.mock("./postHogClient", () => ({ PostHogClient: vi.fn().mockImplementation(() => ({ diff --git a/src/tools/CancelJobTool.test.ts b/src/tools/CancelJobTool.test.ts index aeaf1187..090e65c5 100644 --- a/src/tools/CancelJobTool.test.ts +++ b/src/tools/CancelJobTool.test.ts @@ -6,7 +6,6 @@ import { ToolError } from "./errors"; // Mock dependencies vi.mock("../pipeline/PipelineManager"); -vi.mock("../utils/logger"); describe("CancelJobTool", () => { let mockManagerInstance: Partial; diff --git a/src/tools/ClearCompletedJobsTool.test.ts b/src/tools/ClearCompletedJobsTool.test.ts index c3f6a810..d2702e37 100644 --- a/src/tools/ClearCompletedJobsTool.test.ts +++ b/src/tools/ClearCompletedJobsTool.test.ts @@ -5,7 +5,6 @@ import { ToolError } from "./errors"; // Mock dependencies vi.mock("../pipeline/PipelineManager"); -vi.mock("../utils/logger"); describe("ClearCompletedJobsTool", () => { let mockManagerInstance: Partial; diff --git a/src/tools/FetchUrlTool.test.ts b/src/tools/FetchUrlTool.test.ts index 3f51a375..10c5f694 100644 --- a/src/tools/FetchUrlTool.test.ts +++ b/src/tools/FetchUrlTool.test.ts @@ -6,7 +6,6 @@ import { ToolError, ValidationError } from "./errors"; import { FetchUrlTool, type FetchUrlToolOptions } from "./FetchUrlTool"; // Mock dependencies -vi.mock("../utils/logger"); describe("FetchUrlTool", () => { let mockAutoDetectFetcher: Partial; diff --git a/src/tools/FindVersionTool.test.ts b/src/tools/FindVersionTool.test.ts index 915f893d..76459e09 100644 --- a/src/tools/FindVersionTool.test.ts +++ b/src/tools/FindVersionTool.test.ts @@ -5,7 +5,6 @@ import { 
FindVersionTool, type FindVersionToolOptions } from "./FindVersionTool" // Mock dependencies vi.mock("../store"); // Mock the entire store module if DocumentManagementService is complex -vi.mock("../utils/logger"); describe("FindVersionTool", () => { let mockDocService: Partial; diff --git a/src/tools/GetJobInfoTool.test.ts b/src/tools/GetJobInfoTool.test.ts index 9f1c182e..1886c3c0 100644 --- a/src/tools/GetJobInfoTool.test.ts +++ b/src/tools/GetJobInfoTool.test.ts @@ -6,7 +6,6 @@ import { GetJobInfoTool } from "./GetJobInfoTool"; // Updated import // Mock dependencies vi.mock("../pipeline/PipelineManager"); -vi.mock("../utils/logger"); describe("GetJobInfoTool", () => { // Updated describe block diff --git a/src/tools/ListJobsTool.test.ts b/src/tools/ListJobsTool.test.ts index c3ed6f45..58a05475 100644 --- a/src/tools/ListJobsTool.test.ts +++ b/src/tools/ListJobsTool.test.ts @@ -7,7 +7,6 @@ import { ListJobsTool } from "./ListJobsTool"; // Mock dependencies vi.mock("../pipeline/PipelineManager"); -vi.mock("../utils/logger"); describe("ListJobsTool", () => { // Define the mock instance directly diff --git a/src/tools/ListLibrariesTool.test.ts b/src/tools/ListLibrariesTool.test.ts index 01234980..50d5cbf6 100644 --- a/src/tools/ListLibrariesTool.test.ts +++ b/src/tools/ListLibrariesTool.test.ts @@ -4,7 +4,6 @@ import { ListLibrariesTool } from "./ListLibrariesTool"; // Mock dependencies vi.mock("../store/DocumentManagementService"); -vi.mock("../utils/logger"); describe("ListLibrariesTool", () => { let mockDocService: Partial; diff --git a/src/tools/RemoveTool.test.ts b/src/tools/RemoveTool.test.ts index 887a1d96..376ef0c7 100644 --- a/src/tools/RemoveTool.test.ts +++ b/src/tools/RemoveTool.test.ts @@ -7,7 +7,6 @@ import { RemoveTool, type RemoveToolArgs } from "./RemoveTool"; // Mock dependencies vi.mock("../store"); -vi.mock("../utils/logger"); // Create a properly typed mock using MockedObject const mockDocService = { diff --git a/src/tools/ScrapeTool.test.ts b/src/tools/ScrapeTool.test.ts index c04d88f9..c178e3e7 100644 --- a/src/tools/ScrapeTool.test.ts +++ b/src/tools/ScrapeTool.test.ts @@ -7,7 +7,6 @@ import { ScrapeTool, type ScrapeToolOptions } from "./ScrapeTool"; // Mock dependencies vi.mock("../pipeline/PipelineManager"); -vi.mock("../utils/logger"); describe("ScrapeTool", () => { let mockManagerInstance: Partial; // Mock manager instance diff --git a/src/tools/SearchTool.test.ts b/src/tools/SearchTool.test.ts index 0c2d2343..41138847 100644 --- a/src/tools/SearchTool.test.ts +++ b/src/tools/SearchTool.test.ts @@ -8,7 +8,6 @@ import type { StoreSearchResult } from "../store/types"; import { SearchTool, type SearchToolOptions } from "./SearchTool"; // Mock dependencies -vi.mock("../utils/logger"); describe("SearchTool", () => { let mockDocService: Partial; diff --git a/src/utils/url.test.ts b/src/utils/url.test.ts index 9d023c93..ca099a2b 100644 --- a/src/utils/url.test.ts +++ b/src/utils/url.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it, vi } from "vitest"; +import { describe, expect, it } from "vitest"; import { extractPrimaryDomain, hasSameDomain, @@ -7,8 +7,6 @@ import { normalizeUrl, } from "./url"; -vi.mock("./logger"); - describe("URL normalization", () => { describe("default behavior", () => { it("should preserve query parameters", () => { diff --git a/test/setup.ts b/test/setup.ts new file mode 100644 index 00000000..c2ddf165 --- /dev/null +++ b/test/setup.ts @@ -0,0 +1,3 @@ +import { vi } from "vitest"; + +vi.mock("../src/utils/logger"); diff --git 
a/test/vitest.config.ts b/test/vitest.config.ts index 5b49e591..f6f182be 100644 --- a/test/vitest.config.ts +++ b/test/vitest.config.ts @@ -10,6 +10,7 @@ export default defineConfig({ environment: "node", testTimeout: 30000, // 30 seconds for network operations include: ["test/**/*.test.ts"], + setupFiles: ["./setup.ts"], // // Allow parallel execution with reasonable concurrency // maxConcurrency: 5, // Limit concurrent tests to be respectful to target sites // Add retry for flaky network tests diff --git a/vite.config.ts b/vite.config.ts index bfb63323..ae6a0d49 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -77,5 +77,6 @@ export default defineConfig({ testTimeout: 5000, include: ["src/**/*.test.ts", "src/**/*.test.tsx"], exclude: ["test/**/*.test.ts"], + setupFiles: ["test/setup.ts"], }, }); From 91a0be39d2e128ff0271f9619ef53388e1705622 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Fri, 14 Nov 2025 09:52:24 -0600 Subject: [PATCH 17/20] fix(scraper): implement include/exclude patterns for link processing in WebScraperStrategy --- .../strategies/WebScraperStrategy.test.ts | 128 ++++++++++++++++++ src/scraper/strategies/WebScraperStrategy.ts | 27 ++-- 2 files changed, 142 insertions(+), 13 deletions(-) diff --git a/src/scraper/strategies/WebScraperStrategy.test.ts b/src/scraper/strategies/WebScraperStrategy.test.ts index 0a3becae..3db9bffd 100644 --- a/src/scraper/strategies/WebScraperStrategy.test.ts +++ b/src/scraper/strategies/WebScraperStrategy.test.ts @@ -689,6 +689,134 @@ describe("WebScraperStrategy", () => { const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(3); // Base + 2 allowed pages }); + + it("should respect includePatterns and excludePatterns from base class", async () => { + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com/docs/") { + return { + content: ` + Docs + Guide + API + V2 Docs + V2 Guide + API Endpoint + `, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + } + return { + content: `${url}${url}`, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + }); + + options.url = "https://example.com/docs/"; + options.includePatterns = ["docs/*"]; + options.excludePatterns = ["docs/v2/**"]; + options.maxDepth = 2; + options.maxPages = 10; + + const progressCallback = vi.fn>(); + + await strategy.scrape(options, progressCallback); + + // Verify base page was fetched + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/docs/", + expect.anything(), + ); + + // Verify included pages were fetched + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/docs/guide", + expect.anything(), + ); + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/docs/api", + expect.anything(), + ); + + // Verify excluded pages were NOT fetched (v2 docs) + expect(mockFetchFn).not.toHaveBeenCalledWith( + "https://example.com/docs/v2/", + expect.anything(), + ); + expect(mockFetchFn).not.toHaveBeenCalledWith( + "https://example.com/docs/v2/guide", + expect.anything(), + ); + + // Verify page outside include pattern was NOT fetched + expect(mockFetchFn).not.toHaveBeenCalledWith( + "https://example.com/api/endpoint", + expect.anything(), + ); + + // Verify documents were produced only for included and non-excluded pages + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); + expect(receivedDocs).toHaveLength(3); // Base + guide + api + }); + + it("should apply excludePatterns 
even when no includePatterns are specified", async () => { + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com/") { + return { + content: ` + Home + Intro + Secret + Blog + `, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + } + return { + content: `${url}${url}`, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + }); + + options.url = "https://example.com/"; + options.excludePatterns = ["**/private/**"]; + options.maxDepth = 1; + options.maxPages = 10; + + const progressCallback = vi.fn>(); + + await strategy.scrape(options, progressCallback); + + // Verify base page was fetched + expect(mockFetchFn).toHaveBeenCalledWith("https://example.com/", expect.anything()); + + // Verify non-excluded pages were fetched + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/docs/intro", + expect.anything(), + ); + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/blog/post", + expect.anything(), + ); + + // Verify excluded page was NOT fetched + expect(mockFetchFn).not.toHaveBeenCalledWith( + "https://example.com/docs/private/secret", + expect.anything(), + ); + + // Verify documents + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); + expect(receivedDocs).toHaveLength(3); // Base + intro + blog + }); }); // Canonical redirect test: relative links resolve against canonical final URL (directory form) diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index 6a55d2a5..df7ca499 100644 --- a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -5,7 +5,6 @@ import { FetchStatus, type RawContent } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; import type { ContentPipeline, PipelineResult } from "../pipelines/types"; import type { QueueItem, ScraperOptions } from "../types"; -import { isInScope } from "../utils/scope"; import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; export interface WebScraperStrategyOptions { @@ -117,23 +116,25 @@ export class WebScraperStrategy extends BaseScraperStrategy { }; } - // Determine base for scope filtering: - // For depth 0 (initial page) use the final fetched URL (rawContent.source) so protocol/host redirects don't drop links. - // For deeper pages, use canonicalBaseUrl (set after first page) or fallback to original. - const baseUrl = - item.depth === 0 - ? new URL(rawContent.source) - : (this.canonicalBaseUrl ?? new URL(options.url)); + // Update canonical base URL from the first page's final URL (after redirects) + if (item.depth === 0) { + this.canonicalBaseUrl = new URL(rawContent.source); + } const filteredLinks = processed.links?.filter((link) => { try { const targetUrl = new URL(link); - const scope = options.scope || "subpages"; - return ( - isInScope(baseUrl, targetUrl, scope) && - (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl)) - ); + // Use the base class's shouldProcessUrl which handles scope + include/exclude patterns + if (!this.shouldProcessUrl(targetUrl.href, options)) { + return false; + } + // Apply optional custom filter function if provided + if (this.shouldFollowLinkFn) { + const baseUrl = this.canonicalBaseUrl ?? 
new URL(options.url); + return this.shouldFollowLinkFn(baseUrl, targetUrl); + } + return true; } catch { return false; } From 158eda1c26c286796bdd02074343c86421efea36 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Fri, 14 Nov 2025 10:38:40 -0600 Subject: [PATCH 18/20] feat(scraper): enhance URL matching to support full URL and pathname patterns --- src/scraper/utils/patternMatcher.test.ts | 115 +++++++++++++++++++++++ src/scraper/utils/patternMatcher.ts | 20 +++- 2 files changed, 131 insertions(+), 4 deletions(-) diff --git a/src/scraper/utils/patternMatcher.test.ts b/src/scraper/utils/patternMatcher.test.ts index 5fa1ba02..5583f291 100644 --- a/src/scraper/utils/patternMatcher.test.ts +++ b/src/scraper/utils/patternMatcher.test.ts @@ -452,4 +452,119 @@ describe("patternMatcher", () => { ); }); }); + + describe("full URL vs pathname pattern matching", () => { + it("should match patterns against both full URL and pathname", () => { + const testUrl = "https://example.com/docs/v3/guide"; + + // Full URL patterns should work + expect( + shouldIncludeUrl(testUrl, undefined, ["https://example.com/docs/v3/**"]), + ).toBe(false); + expect( + shouldIncludeUrl(testUrl, undefined, ["https://example.com/docs/v2/**"]), + ).toBe(true); // different version, should NOT exclude + + // Path-based patterns should also work + expect(shouldIncludeUrl(testUrl, undefined, ["/docs/v3/**"])).toBe(false); + expect(shouldIncludeUrl(testUrl, undefined, ["/docs/v2/**"])).toBe(true); // different version, should NOT exclude + + // Relative path patterns should work + expect(shouldIncludeUrl(testUrl, undefined, ["docs/v3/**"])).toBe(false); + expect(shouldIncludeUrl(testUrl, undefined, ["docs/v2/**"])).toBe(true); // different version, should NOT exclude + }); + + it("should match directory paths with trailing slash", () => { + const testUrl = "https://example.com/docs/v3/"; + + // Pattern should match both with and without trailing slash + expect(shouldIncludeUrl(testUrl, undefined, ["/docs/v3/**"])).toBe(false); + expect( + shouldIncludeUrl(testUrl, undefined, ["https://example.com/docs/v3/**"]), + ).toBe(false); + expect(shouldIncludeUrl(testUrl, undefined, ["docs/v3/**"])).toBe(false); + }); + + it("should support includePatterns with both full URL and pathname", () => { + const testUrl = "https://example.com/docs/guide"; + + // Full URL include pattern + expect(shouldIncludeUrl(testUrl, ["https://example.com/docs/**"])).toBe(true); + expect(shouldIncludeUrl(testUrl, ["https://example.com/api/**"])).toBe(false); + + // Path-based include pattern + expect(shouldIncludeUrl(testUrl, ["/docs/**"])).toBe(true); + expect(shouldIncludeUrl(testUrl, ["/api/**"])).toBe(false); + + // Relative path include pattern + expect(shouldIncludeUrl(testUrl, ["docs/**"])).toBe(true); + expect(shouldIncludeUrl(testUrl, ["api/**"])).toBe(false); + }); + + it("should handle v3 exclusion with full URL pattern", () => { + const v3Url = "https://example.com/docs/v3/"; + const v3GuideUrl = "https://example.com/docs/v3/getting-started"; + + // Full URL pattern should exclude v3 URLs + expect(shouldIncludeUrl(v3Url, undefined, ["https://example.com/docs/v3/**"])).toBe( + false, + ); + expect( + shouldIncludeUrl(v3GuideUrl, undefined, ["https://example.com/docs/v3/**"]), + ).toBe(false); + }); + + it("should handle v3 exclusion with absolute path pattern", () => { + const v3Url = "https://example.com/docs/v3/"; + const v3GuideUrl = "https://example.com/docs/v3/getting-started"; + + // Absolute path pattern should exclude v3 URLs + 
expect(shouldIncludeUrl(v3Url, undefined, ["/docs/v3/**"])).toBe(false); + expect(shouldIncludeUrl(v3GuideUrl, undefined, ["/docs/v3/**"])).toBe(false); + }); + + it("should handle v3 exclusion with relative path pattern", () => { + const v3Url = "https://example.com/docs/v3/"; + const v3GuideUrl = "https://example.com/docs/v3/getting-started"; + + // Relative path pattern should exclude v3 URLs + expect(shouldIncludeUrl(v3Url, undefined, ["docs/v3/**"])).toBe(false); + expect(shouldIncludeUrl(v3GuideUrl, undefined, ["docs/v3/**"])).toBe(false); + }); + + it("should support wildcards in domain for full URL patterns", () => { + const testUrl = "https://docs.example.com/guide"; + + // Exact domain match + expect(shouldIncludeUrl(testUrl, undefined, ["https://docs.example.com/**"])).toBe( + false, + ); + + // Different domain should not match + expect(shouldIncludeUrl(testUrl, undefined, ["https://api.example.com/**"])).toBe( + true, + ); + + // Wildcard subdomain (using regex) + expect( + shouldIncludeUrl(testUrl, undefined, ["/https:\\/\\/.*\\.example\\.com\\/.*/"]), + ).toBe(false); + }); + + it("should maintain backward compatibility with existing patterns", () => { + // All existing tests should still pass with the enhanced matching + const testUrl = "https://example.com/docs/archive/old.md"; + + // Pattern without leading slash (relative) + expect(shouldIncludeUrl(testUrl, undefined, ["**/archive/**"])).toBe(false); + + // Pattern with leading slash (absolute path) + expect(shouldIncludeUrl(testUrl, undefined, ["/docs/archive/**"])).toBe(false); + + // Basename matching for file:// URLs + expect( + shouldIncludeUrl("file:///path/to/CHANGELOG.md", undefined, ["**/CHANGELOG.md"]), + ).toBe(false); + }); + }); }); diff --git a/src/scraper/utils/patternMatcher.ts b/src/scraper/utils/patternMatcher.ts index 20ce7a68..ef84d46f 100644 --- a/src/scraper/utils/patternMatcher.ts +++ b/src/scraper/utils/patternMatcher.ts @@ -48,9 +48,12 @@ export function matchesAnyPattern(path: string, patterns?: string[]): boolean { if (isRegexPattern(pattern)) { return patternToRegExp(pattern).test(normalizedPath); } - // minimatch expects no leading slash for relative globs, but we keep it for consistency - // so we strip the leading slash for minimatch - return minimatch(normalizedPath.replace(/^\//, ""), pattern, { dot: true }); + // For glob patterns: + // - If pattern starts with '/', strip leading slash from BOTH pattern and path for minimatch + // - Otherwise, strip leading slash only from path + const pathForMatch = normalizedPath.replace(/^\//, ""); + const patternForMatch = pattern.startsWith("/") ? pattern.slice(1) : pattern; + return minimatch(pathForMatch, patternForMatch, { dot: true }); }); } @@ -73,15 +76,20 @@ export function extractPathAndQuery(url: string): string { * If no user exclude patterns are provided, default exclusion patterns are automatically applied. * These defaults exclude common documentation files (CHANGELOG.md, LICENSE, etc.) and folders * (archives, non-English locales, etc.). 
+ * + * Patterns are matched against both the full URL and the pathname for maximum flexibility: + * - Full URL: `https://example.com/docs/v3/**` matches `https://example.com/docs/v3/guide` + * - Pathname: `/docs/v3/**` matches `https://example.com/docs/v3/guide` */ export function shouldIncludeUrl( url: string, includePatterns?: string[], excludePatterns?: string[], ): boolean { - // Always match from a leading slash for path-based globs + // Extract pathname for path-based pattern matching const path = extractPathAndQuery(url); const normalizedPath = path.startsWith("/") ? path : `/${path}`; + // For file:// URLs, also match against the basename (strip leading slash from pattern for basename matching) let basename: string | undefined; if (url.startsWith("file://")) { @@ -98,13 +106,17 @@ export function shouldIncludeUrl( const effectiveExcludePatterns = getEffectiveExclusionPatterns(excludePatterns); // Exclude patterns take precedence + // Match against BOTH full URL and pathname for flexibility if ( + matchesAnyPattern(url, effectiveExcludePatterns) || matchesAnyPattern(normalizedPath, effectiveExcludePatterns) || (basename && matchesAnyPattern(basename, stripSlash(effectiveExcludePatterns))) ) return false; if (!includePatterns || includePatterns.length === 0) return true; + // Match against BOTH full URL and pathname for flexibility return ( + matchesAnyPattern(url, includePatterns) || matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false) ); From 838af52169c9dd05197b10ff42eeb88b5c9229b4 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Fri, 14 Nov 2025 11:48:19 -0600 Subject: [PATCH 19/20] feat(pipeline): enhance enqueueRefreshJob to handle incomplete versions with full re-scrape --- src/pipeline/PipelineManager.test.ts | 125 +++++++++++++++++++++++++ src/pipeline/PipelineManager.ts | 24 +++++ src/store/DocumentManagementService.ts | 14 +++ src/store/DocumentStore.ts | 32 +++++++ 4 files changed, 195 insertions(+) diff --git a/src/pipeline/PipelineManager.test.ts b/src/pipeline/PipelineManager.test.ts index ae5113a6..89a80f46 100644 --- a/src/pipeline/PipelineManager.test.ts +++ b/src/pipeline/PipelineManager.test.ts @@ -114,6 +114,18 @@ describe("PipelineManager", () => { ensureVersion: vi.fn().mockResolvedValue(1), getPagesByVersionId: vi.fn().mockResolvedValue([]), getScraperOptions: vi.fn().mockResolvedValue(null), + getVersionById: vi.fn().mockResolvedValue({ + id: 1, + library_id: 1, + name: "1.0.0", + status: "completed", + created_at: "2025-01-01T00:00:00.000Z", + updated_at: "2025-01-01T00:01:00.000Z", + }), + getLibraryById: vi.fn().mockResolvedValue({ + id: 1, + name: "test-lib", + }), }; // Mock the worker's executeJob method @@ -729,5 +741,118 @@ describe("PipelineManager", () => { expect(shallowItem?.etag).toBe(null); expect(shallowItem?.pageId).toBe(11); }); + + it("should perform full re-scrape instead of refresh when version is not completed", async () => { + // Setup: Mock an incomplete version (failed scrape) + const mockPages = [ + { id: 1, url: "https://example.com/page1", depth: 0, etag: "etag1" }, + { id: 2, url: "https://example.com/page2", depth: 1, etag: "etag2" }, + ]; + + (mockStore.ensureVersion as Mock).mockResolvedValue(555); + (mockStore.getVersionById as Mock).mockResolvedValue({ + id: 555, + library_id: 1, + name: "1.0.0", + status: "failed", // Version was not completed + created_at: "2025-01-01T00:00:00.000Z", + updated_at: "2025-01-01T00:01:00.000Z", + }); + 
(mockStore.getLibraryById as Mock).mockResolvedValue({ + id: 1, + name: "incomplete-lib", + }); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue(mockPages); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: { maxDepth: 2 }, + }); + + // Spy on enqueueJobWithStoredOptions to verify it's called + const enqueueStoredSpy = vi.spyOn(manager, "enqueueJobWithStoredOptions"); + enqueueStoredSpy.mockResolvedValue("mock-job-id"); + + // Action: Attempt to enqueue a refresh job + const jobId = await manager.enqueueRefreshJob("incomplete-lib", "1.0.0"); + + // Assertions: Should have called enqueueJobWithStoredOptions instead of normal refresh + expect(enqueueStoredSpy).toHaveBeenCalledWith("incomplete-lib", "1.0.0"); + expect(jobId).toBe("mock-job-id"); + + // Should NOT have called getPagesByVersionId since we're doing a full re-scrape + expect(mockStore.getPagesByVersionId).not.toHaveBeenCalled(); + }); + + it("should perform full re-scrape for queued versions during refresh", async () => { + // Setup: Mock a queued version (never started) + (mockStore.ensureVersion as Mock).mockResolvedValue(666); + (mockStore.getVersionById as Mock).mockResolvedValue({ + id: 666, + library_id: 2, + name: "2.0.0", + status: "queued", // Version is still queued + created_at: "2025-01-01T00:00:00.000Z", + updated_at: "2025-01-01T00:00:00.000Z", + }); + (mockStore.getLibraryById as Mock).mockResolvedValue({ + id: 2, + name: "queued-lib", + }); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: {}, + }); + + // Spy on enqueueJobWithStoredOptions + const enqueueStoredSpy = vi.spyOn(manager, "enqueueJobWithStoredOptions"); + enqueueStoredSpy.mockResolvedValue("queued-job-id"); + + // Action: Attempt to enqueue a refresh job + await manager.enqueueRefreshJob("queued-lib", "2.0.0"); + + // Assertions: Should perform full re-scrape for queued versions + expect(enqueueStoredSpy).toHaveBeenCalledWith("queued-lib", "2.0.0"); + }); + + it("should perform normal refresh for completed versions", async () => { + // Setup: Mock a completed version + const mockPages = [ + { id: 1, url: "https://example.com/page1", depth: 0, etag: "etag1" }, + ]; + + (mockStore.ensureVersion as Mock).mockResolvedValue(777); + (mockStore.getVersionById as Mock).mockResolvedValue({ + id: 777, + library_id: 3, + name: "3.0.0", + status: "completed", // Version is completed successfully + created_at: "2025-01-01T00:00:00.000Z", + updated_at: "2025-01-01T00:01:00.000Z", + }); + (mockStore.getLibraryById as Mock).mockResolvedValue({ + id: 3, + name: "completed-lib", + }); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue(mockPages); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: {}, + }); + + // Spy on enqueueJobWithStoredOptions to ensure it's NOT called + const enqueueStoredSpy = vi.spyOn(manager, "enqueueJobWithStoredOptions"); + + // Action: Enqueue a refresh job + const jobId = await manager.enqueueRefreshJob("completed-lib", "3.0.0"); + + // Assertions: Should perform normal refresh, NOT full re-scrape + expect(enqueueStoredSpy).not.toHaveBeenCalled(); + expect(mockStore.getPagesByVersionId).toHaveBeenCalledWith(777); + + const job = await manager.getJob(jobId); + expect(job).toBeDefined(); + expect(job?.library).toBe("completed-lib"); + expect(job?.version).toBe("3.0.0"); + }); }); }); diff --git a/src/pipeline/PipelineManager.ts 
b/src/pipeline/PipelineManager.ts index 8dc9a7fd..4f1ce399 100644 --- a/src/pipeline/PipelineManager.ts +++ b/src/pipeline/PipelineManager.ts @@ -316,6 +316,9 @@ export class PipelineManager implements IPipeline { /** * Enqueues a refresh job for an existing library version by re-scraping all pages * and using ETag comparison to skip unchanged content. + * + * If the version was never completed (interrupted or failed scrape), performs a + * full re-scrape from scratch instead of a refresh to ensure completeness. */ async enqueueRefreshJob( library: string, @@ -331,6 +334,27 @@ export class PipelineManager implements IPipeline { version: normalizedVersion, }); + // Check the version's status to detect incomplete scrapes + const versionInfo = await this.store.getVersionById(versionId); + if (!versionInfo) { + throw new Error(`Version ID ${versionId} not found`); + } + + // Get library information + const libraryInfo = await this.store.getLibraryById(versionInfo.library_id); + if (!libraryInfo) { + throw new Error(`Library ID ${versionInfo.library_id} not found`); + } + + // If the version is not completed, it means the previous scrape was interrupted + // or failed. In this case, perform a full re-scrape instead of a refresh. + if (versionInfo && versionInfo.status !== VersionStatus.COMPLETED) { + logger.info( + `⚠️ Version ${library}@${normalizedVersion || "unversioned"} has status "${versionInfo.status}". Performing full re-scrape instead of refresh.`, + ); + return this.enqueueJobWithStoredOptions(library, normalizedVersion); + } + // Get all pages for this version with their ETags and depths const pages = await this.store.getPagesByVersionId(versionId); diff --git a/src/store/DocumentManagementService.ts b/src/store/DocumentManagementService.ts index 7f660cd0..34299e12 100644 --- a/src/store/DocumentManagementService.ts +++ b/src/store/DocumentManagementService.ts @@ -508,4 +508,18 @@ export class DocumentManagementService { return versionId; } + + /** + * Retrieves a version by its ID from the database. + */ + async getVersionById(versionId: number) { + return this.store.getVersionById(versionId); + } + + /** + * Retrieves a library by its ID from the database. + */ + async getLibraryById(libraryId: number) { + return this.store.getLibraryById(libraryId); + } } diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index a6c0e022..033865d0 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -96,6 +96,7 @@ export class DocumentStore { getParentChunk: Database.Statement<[string, string, string, string, bigint]>; insertLibrary: Database.Statement<[string]>; getLibraryIdByName: Database.Statement<[string]>; + getLibraryById: Database.Statement<[number]>; // New version-related statements insertVersion: Database.Statement<[number, string | null]>; resolveVersionId: Database.Statement<[number, string | null]>; @@ -219,6 +220,7 @@ export class DocumentStore { getLibraryIdByName: this.db.prepare<[string]>( "SELECT id FROM libraries WHERE name = ?", ), + getLibraryById: this.db.prepare<[number]>("SELECT * FROM libraries WHERE id = ?"), // New version-related statements insertVersion: this.db.prepare<[number, string]>( "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING", @@ -672,6 +674,36 @@ export class DocumentStore { } } + /** + * Retrieves a version by its ID. 
+   * @param versionId The version ID to retrieve
+   * @returns The version record, or null if not found
+   */
+  async getVersionById(versionId: number): Promise<DbVersion | null> {
+    try {
+      const row = this.statements.getVersionById.get(versionId) as DbVersion | undefined;
+      return row || null;
+    } catch (error) {
+      throw new StoreError(`Failed to get version by ID: ${error}`);
+    }
+  }
+
+  /**
+   * Retrieves a library by its ID.
+   * @param libraryId The library ID to retrieve
+   * @returns The library record, or null if not found
+   */
+  async getLibraryById(libraryId: number): Promise<{ id: number; name: string } | null> {
+    try {
+      const row = this.statements.getLibraryById.get(libraryId) as
+        | { id: number; name: string }
+        | undefined;
+      return row || null;
+    } catch (error) {
+      throw new StoreError(`Failed to get library by ID: ${error}`);
+    }
+  }
+
   /**
    * Stores scraper options for a version to enable reproducible indexing.
    * @param versionId The version ID to update

From 5dfe2221091334780c7523c605e987d5a91430d3 Mon Sep 17 00:00:00 2001
From: Andre Rabold
Date: Fri, 14 Nov 2025 11:51:37 -0600
Subject: [PATCH 20/20] fix(tests): correct setupFiles path in vitest configuration

---
 test/vitest.config.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/vitest.config.ts b/test/vitest.config.ts
index f6f182be..4f5dbd3f 100644
--- a/test/vitest.config.ts
+++ b/test/vitest.config.ts
@@ -10,7 +10,7 @@ export default defineConfig({
     environment: "node",
     testTimeout: 30000, // 30 seconds for network operations
    include: ["test/**/*.test.ts"],
-    setupFiles: ["./setup.ts"],
+    setupFiles: ["test/setup.ts"],
     // // Allow parallel execution with reasonable concurrency
     // maxConcurrency: 5, // Limit concurrent tests to be respectful to target sites
     // Add retry for flaky network tests
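The per-file vi.mock("../utils/logger") calls removed throughout this series are replaced by one shared setup file. A minimal sketch of that wiring, reproducing the test/setup.ts added above with comments describing the intended effect:

// test/setup.ts — registered via `setupFiles` in both Vitest configs, so it runs
// before every test file. The path is relative to this file and resolves to
// src/utils/logger; calling vi.mock without a factory swaps in an auto-mock,
// silencing logger output across the suite without per-file boilerplate.
import { vi } from "vitest";

vi.mock("../src/utils/logger");

The follow-up fix in PATCH 20 changes the test/vitest.config.ts entry from "./setup.ts" to "test/setup.ts", which suggests setupFiles paths are resolved against the project root rather than the config file's directory.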
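PATCH 18 widens URL filtering so include/exclude globs can target either the full URL or just the pathname. A usage sketch of shouldIncludeUrl mirroring the new test assertions (the import path is illustrative and assumes the repository root as working directory):

// Exclude patterns may be written as a full URL, an absolute path, or a relative
// path; all three forms reject the same v3 documentation URL, while a pattern for
// a different version leaves it included.
import { shouldIncludeUrl } from "./src/scraper/utils/patternMatcher";

const v3Guide = "https://example.com/docs/v3/guide";

shouldIncludeUrl(v3Guide, undefined, ["https://example.com/docs/v3/**"]); // false (full URL)
shouldIncludeUrl(v3Guide, undefined, ["/docs/v3/**"]); // false (absolute path)
shouldIncludeUrl(v3Guide, undefined, ["docs/v3/**"]); // false (relative path)
shouldIncludeUrl(v3Guide, undefined, ["docs/v2/**"]); // true (different version)

// Include patterns accept the same forms; exclude patterns still take precedence.
shouldIncludeUrl("https://example.com/docs/guide", ["https://example.com/docs/**"]); // true
shouldIncludeUrl("https://example.com/docs/guide", ["/api/**"]); // false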
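PATCH 19's change to enqueueRefreshJob boils down to a status check before choosing between a refresh and a full re-scrape. A condensed sketch with simplified stand-in types: enqueueRefreshForPages is a hypothetical placeholder for the manager's internal job assembly, and the real code compares against VersionStatus.COMPLETED rather than a raw string.

type VersionRecord = { id: number; status: string };
type PageRecord = { id: number; url: string; depth: number; etag: string | null };

interface RefreshDeps {
  ensureVersion(ref: { library: string; version: string | null }): Promise<number>;
  getVersionById(id: number): Promise<VersionRecord | null>;
  getPagesByVersionId(id: number): Promise<PageRecord[]>;
  enqueueJobWithStoredOptions(library: string, version: string | null): Promise<string>;
  // Hypothetical stand-in for the manager's internal refresh-job assembly.
  enqueueRefreshForPages(library: string, version: string | null, pages: PageRecord[]): Promise<string>;
}

async function enqueueRefreshOrRescrape(
  deps: RefreshDeps,
  library: string,
  version: string | null,
): Promise<string> {
  const versionId = await deps.ensureVersion({ library, version });

  const info = await deps.getVersionById(versionId);
  if (!info) throw new Error(`Version ID ${versionId} not found`);

  // A version that never reached "completed" was interrupted or failed, so a
  // refresh would only revisit a partial page set; re-scrape from scratch using
  // the scraper options stored for this version.
  if (info.status !== "completed") {
    return deps.enqueueJobWithStoredOptions(library, version);
  }

  // Completed version: re-fetch the pages already indexed (URL, depth, ETag) and
  // let ETag comparison skip unchanged content downstream.
  const pages = await deps.getPagesByVersionId(versionId);
  return deps.enqueueRefreshForPages(library, version, pages);
}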