diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 69841ed9..135cb3dd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] jobs: lint: @@ -17,8 +17,8 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '>=20.0.0' # Match engines requirement in package.json - cache: 'npm' + node-version: ">=20.0.0" # Match engines requirement in package.json + cache: "npm" - name: Install dependencies run: npm ci @@ -26,6 +26,9 @@ jobs: - name: Run linter run: npm run lint + - name: Run type checker + run: npm run typecheck + test: name: Test runs-on: ubuntu-latest @@ -37,8 +40,8 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '>=20.0.0' - cache: 'npm' + node-version: ">=20.0.0" + cache: "npm" - name: Install dependencies run: npm ci @@ -60,8 +63,8 @@ jobs: - name: Set up Node.js uses: actions/setup-node@v4 with: - node-version: '>=20.0.0' - cache: 'npm' + node-version: ">=20.0.0" + cache: "npm" - name: Install dependencies run: npm ci diff --git a/AGENTS.md b/AGENTS.md index a459d330..b8177854 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,83 +1,171 @@ -# Custom Instructions +# Agent Instructions for docs-mcp-server -- The repository for this project is located on GitHub at `arabold/docs-mcp-server`. -- You must read the `README.md` to understand the project structure and setup. -- You must read the `ARCHITECTURE.md` file before making changes across multiple services. -- You must follow DRY, KISS, YAGNI, and SOLID principles. -- You must use the latest version of the programming language and libraries. -- Prefer the simplest solution. -- Never commit secrets, credentials, or sensitive data to the repository. +## Repository Context + +- Repository: `arabold/docs-mcp-server` +- Read `README.md` for project structure and setup +- Read `ARCHITECTURE.md` before making changes across multiple services +- Follow DRY, KISS, YAGNI, and SOLID principles +- Use latest stable versions of programming language and libraries +- Prefer the simplest solution that meets requirements +- Never commit secrets, credentials, or sensitive data ## Documentation -- The `README.md` targets end users that utilize the library for the first time. It should primarily cover prerequisites, installation, configuration, first start, trouble shooting. -- The `ARCHITECTURE.md` targets developers making active changes to the code. It should give a high level overview of the architecture of the library, a feature list, and then reference individual feature documentations in the docs/ folder. -- Write in present tense, describing how the system currently works -- Focus on what the system does, not what it doesn't do or used to do -- Avoid discussing past problems, bugs, or alternative approaches unless directly relevant to understanding the current design -- Use declarative statements rather than explanatory narratives -- Don't include "Important" callouts or emphasis unless documenting critical constraints -- Avoid problem/solution framing - just describe the current behavior and its rationale -- Keep examples focused on illustrating current functionality, not contrasting with previous versions -- Do not create new documentation files unless explicitly asked to. Instead update existing files or create new sections as needed. 
+### File Targets + +- `README.md` targets end users: prerequisites, installation, configuration, first start, troubleshooting +- `ARCHITECTURE.md` targets active developers: high-level architecture, feature list, references to `docs/` folder +- `docs/` folder provides deep dives into specific features, subsystems, or technical concepts + +### Writing Principles + +- Use present tense to describe current system behavior +- Use declarative statements, not explanatory narratives +- Describe what the system does, not what it doesn't do or used to do +- Avoid problem/solution framing - describe current behavior and rationale +- Omit "Important" callouts unless documenting critical constraints or safety issues +- Keep examples focused on current functionality, not historical comparisons +- Update existing documentation or add sections; only create new files when explicitly requested + +### Structure Guidelines + +- Start with high-level overview before details +- Use clear, descriptive section headers +- Progress from concepts to specifics (allows readers to stop when satisfied) +- Use tables for comparing options, statuses, or behaviors +- Include Mermaid diagrams for workflows, state machines, or component relationships +- Focus on high-level concepts and component relationships (use class/interface names when helpful, as they change less frequently than implementation details) +- Explain architectural decisions with trade-offs +- Avoid explaining source code implementation - use TSDoc comments in source files instead ### Source Code Documentation -- Ensure each source file begins with a comment block summarizing its purpose and logic. -- If no block exists, create one before editing. -- After completing changes, update this block to reflect the changes. -- Always make the comment block clear and concise. - -## Architecture - -- Focus on system concepts and component relationships. -- Put implementation details in source code. -- Update `ARCHITECTURE.md` when the architecture changes. -- Do not use special characters like braces in mermaid diagram titles or names. Quote them if necessary. -- Do not use markdown in mermaid diagrams. - -## TypeScript - -- Install dependencies using `npm install` inside `apps/` instead of adding them to the `package.json` file manually. -- We're using Node.js 22.x, `vite-node` for running TypeScript files, and `vitest` for testing. -- Prefer a specific type or `unknown` over `any`. -- Do not use non-null assertions (`!`). Use optional chaining (`?.`) or nullish coalescing (`??`). -- Follow `biome` for formatting and import order. -- Always place `import` statements at the top of the file. - -## Web UI - -- Use AlpineJS for frontend components and TailwindCSS for styling. -- Use TSX with kitajs for AlpineJS components. -- Use HTMX for server-side interactions. -- Avoid `{foo && }` in TSX; use ternary expressions instead. - -## Logging - -- Use `console.*` for CLI user output (results, direct feedback). -- Use `logger.info/warn/error` for meaningful application events; prefix with a relevant emoji. -- Use `logger.debug` for detailed developer/tracing logs; no emoji prefix. -- Prefer `logger.debug` over `logger.info` for granular internal steps to reduce log verbosity. - -## Testing - -- Consider maintainability and efforts when writing tests. -- Always create unit test files alongside the source file with `.test.ts` suffix. -- Focus on high value, low effort tests first. 
Defer complex mocking, complex state management testing and concurrent processing unless explicitly requested by the user. -- Always test the intended bevavior, not the implementation details. -- Avoid timing sensitive tests unless absolutely necessary. - -## Git - -- Branches must be created locally before pushing. -- Branch names must be prefixed with type (`feature/`, `bugfix/`, `chore/`) and include the issue number if available (e.g., `feature/1234-description`). -- All commit messages must use Conventional Commits (`feat:`, `fix:`, etc.). -- Commit subject must be imperative mood and ≤72 characters. -- If a commit body is present, add a blank line before it. -- Commit body (for non-trivial changes) must explain what and why, not how. -- Reference related issues in commit messages when relevant (e.g., `Closes #123`). -- Do not include unrelated changes in a single commit. -- Do not use vague or generic commit messages. -- Pull request descriptions must summarize the what and why of all changes in the branch (not just a list of commits or the how). -- Pull requests must target `main` unless specified otherwise. -- When creating new GitHub issues, use built-in labels to categorize them (e.g., `bug`, `enhancement`, `documentation`) but avoid creating new labels unless explicitly asked to. +- Document source code with TSDoc comments (not in separate documentation files) +- Each source file must begin with a comment block summarizing purpose and logic +- Create the comment block before editing if it doesn't exist +- Update the comment block after completing changes +- Keep comment blocks clear and concise + +## Architecture Documentation + +- Focus on system concepts and component relationships +- Place implementation details in source code, not architecture docs +- Update `ARCHITECTURE.md` when architecture changes +- In Mermaid diagrams: + - Avoid special characters (e.g., braces) in titles or names; quote if necessary + - Do not use markdown formatting + +## TypeScript Conventions + +### Dependencies and Tooling + +- Install dependencies via `npm install` (not by manually editing `package.json`) +- Runtime: Node.js 22.x +- Execution: `vite-node` for running TypeScript files +- Testing: `vitest` + +### Type Safety + +- Prefer specific types or `unknown` over `any` +- Avoid non-null assertions (`!`) +- Use optional chaining (`?.`) and nullish coalescing (`??`) + +### Code Style + +- Follow `biome` for formatting and import order +- Place all `import` statements at the top of files + +## Web UI Stack + +- Frontend components: AlpineJS +- Styling: TailwindCSS +- AlpineJS components: TSX with kitajs +- Server-side interactions: HTMX +- TSX pattern: Use ternary expressions (`{foo ? : null}`), not short-circuit evaluation (`{foo && }`) + +## Logging Strategy + +### Output Channels + +- `console.*`: CLI user output (results, direct feedback to user) +- `logger.info/warn/error`: Meaningful application events (prefix with relevant emoji) +- `logger.debug`: Detailed developer/tracing logs (no emoji prefix) + +### Verbosity Control + +- Prefer `logger.debug` over `logger.info` for granular internal steps +- Reduces default log verbosity while maintaining debugging capability + +## Testing Approach + +### Test Files + +- Unit tests: alongside source files with `.test.ts` suffix +- E2E tests: in `test/` directory with `*-e2e.test.ts` suffix +- Run: `npx vite-node ` + +### Testing Philosophy + +**Core Principle**: Test observable behavior (contracts), not implementation details. 
+ +**Test the "what", not the "how"**: + +- ✅ "File change detection returns SUCCESS for modified files" (observable behavior) +- ❌ "ETag generated from mtime timestamp" (implementation detail) + +**Prefer integration over isolation**: + +- E2E tests > Integration tests > Unit tests +- Default to E2E for new features (highest confidence) +- Add integration tests when components don't interact correctly +- Add unit tests only for complex logic requiring detailed verification + +**What to test**: + +- Public contracts and API boundaries +- Integration points between components +- Complete workflows end-to-end +- Critical business logic + +**What to skip**: + +- Private methods and internal state +- Simple getters/setters and obvious mappings +- Trivial parameter validation +- Implementation-specific details (algorithms, data structures) + +**Quality markers**: + +- Fast: unit tests <100ms, suite <5s +- Focused: one behavior per test +- Maintainable: refactoring doesn't break tests unless behavior changes +- Realistic: tests reflect actual usage patterns + +## Git Workflow + +### Branching + +- Create branches locally before pushing +- Branch naming: `/-` (e.g., `feature/1234-add-refresh-logic`) +- Types: `feature/`, `bugfix/`, `chore/` + +### Commits + +- Format: Conventional Commits (`feat:`, `fix:`, `docs:`, `refactor:`, `test:`, `chore:`) +- Subject: Imperative mood, ≤72 characters +- Body: Separate from subject with blank line +- Body content: Explain what and why, not how (for non-trivial changes) +- Reference issues when relevant (e.g., `Closes #123`) +- One logical change per commit (no unrelated changes) +- Avoid vague messages (e.g., "fix bug", "update code") + +### Pull Requests + +- Description: Summarize what and why of all changes (not just commit list or how) +- Target: `main` branch unless specified otherwise + +### Issues + +- Use built-in labels to categorize (e.g., `bug`, `enhancement`, `documentation`) +- Avoid creating new labels unless explicitly requested diff --git a/biome.json b/biome.json index 4547507c..d6f11719 100644 --- a/biome.json +++ b/biome.json @@ -1,5 +1,5 @@ { - "$schema": "https://biomejs.dev/schemas/2.2.0/schema.json", + "$schema": "https://biomejs.dev/schemas/2.3.2/schema.json", "assist": { "actions": { "source": { diff --git a/db/migrations/010-add-depth-to-pages.sql b/db/migrations/010-add-depth-to-pages.sql new file mode 100644 index 00000000..86a13e43 --- /dev/null +++ b/db/migrations/010-add-depth-to-pages.sql @@ -0,0 +1,16 @@ +-- Migration 010: Add depth column to pages table for refresh functionality +-- This enables tracking the original crawl depth of each page, which is essential +-- for maintaining consistent depth constraints during refresh operations. 
+ +-- Add depth column to pages table +ALTER TABLE pages ADD COLUMN depth INTEGER; + +-- Backfill depth based on stored scraper options +-- Depth 0: Pages whose URL exactly matches the source_url in scraper_options +-- Depth 1: All other pages (discovered during crawl) +UPDATE pages SET depth = CASE + WHEN url = (SELECT source_url FROM versions WHERE versions.id = pages.version_id) + THEN 0 + ELSE 1 +END +WHERE depth IS NULL; diff --git a/docs/refresh-architecture.md b/docs/refresh-architecture.md new file mode 100644 index 00000000..d8598c6d --- /dev/null +++ b/docs/refresh-architecture.md @@ -0,0 +1,525 @@ +# Refresh Architecture + +## Overview + +The refresh system enables efficient re-indexing of previously scraped documentation by leveraging **HTTP conditional requests** and **intelligent change detection**. Instead of re-downloading and re-processing all content, refresh operations check each page for modifications and only process what has changed. + +**Key efficiency gains:** + +- 70-90% reduction in bandwidth usage for typical documentation updates +- Proportional reduction in processing time (unchanged pages skip pipeline entirely) +- Automatic detection and removal of deleted pages +- Discovery and indexing of newly added pages + +The refresh system integrates seamlessly with the existing scraping pipeline, using the same strategies, fetchers, and processors as initial indexing operations. + +--- + +## Core Mechanism: Conditional Requests + +Refresh operations rely on **ETags** (entity tags) - unique identifiers assigned by web servers to specific versions of a resource. When content changes, the ETag changes. + +### How It Works + +**Initial Scraping:** + +1. Fetch page from server +2. Extract content and links +3. Store content in database **with ETag** +4. Continue to discovered links + +**Refresh Operation:** + +1. Load existing pages from database (URL + ETag + pageId) +2. Fetch page with `If-None-Match: ` header +3. Server compares ETags and responds: + - **304 Not Modified** → Content unchanged, skip processing + - **200 OK** → Content changed, re-process through pipeline + - **404 Not Found** → Page deleted, remove from index + +This approach shifts the burden of change detection to the HTTP layer, where it's handled efficiently by web servers and CDNs. + +--- + +## Status Handling + +The system handles three HTTP response statuses during refresh: + +| Status Code | Meaning | Database Action | Pipeline Action | +| -------------------- | ----------------------------------- | ------------------------------------- | ------------------------------- | +| **304 Not Modified** | Content unchanged since last scrape | No changes (preserves existing data) | Skip pipeline, no re-processing | +| **200 OK** | Content modified or new page | Delete old chunks, insert new content | Full pipeline processing | +| **404 Not Found** | Page no longer exists | Delete all documents for this page | Skip pipeline | + +### 304 Not Modified Flow + +When a page returns 304, the system: + +1. Recognizes the page was checked successfully +2. Preserves all existing content in database (no updates) +3. Skips chunking, embedding, and indexing entirely +4. Continues to next page in queue + +This is the **fast path** that makes refresh efficient. + +### 200 OK Flow + +When a page returns 200 with new content, the system: + +1. Deletes existing document chunks for this page (by pageId) +2. Re-processes through full pipeline (HTML→Markdown, chunking, embeddings) +3. 
Inserts new chunks with updated embeddings +4. Updates page metadata (ETag, last_modified, title, etc.) +5. Extracts and follows new links + +This ensures modified content is always current. + +### 404 Not Found Flow + +When a page returns 404, the system: + +1. Deletes the page record AND all associated document chunks (by pageId) +2. Reports deletion via progress callback with `deleted: true` flag +3. Does not follow any links from deleted pages + +**Note:** The `deletePage()` method performs a complete deletion of the page and all its document chunks. This is a hard delete operation that immediately removes the page from search results. The CASCADE DELETE constraint in the database schema ensures all related documents are automatically removed when a page is deleted. + +--- + +## Database Schema + +### Pages Table + +The `pages` table stores page-level metadata with the following key fields: + +- **`id`**: Primary key for the page +- **`version_id`**: Foreign key to the versions table +- **`url`**: The page's URL (unique per version) +- **`title`**: Page title extracted from content +- **`etag`**: HTTP ETag header for change detection +- **`last_modified`**: HTTP Last-Modified header +- **`content_type`**: MIME type of the content +- **`depth`**: Crawl depth at which the page was discovered +- **`created_at`**: Timestamp when page was first indexed +- **`updated_at`**: Timestamp of last update (automatically maintained by triggers) + +The combination of `(version_id, url)` is unique, ensuring one page record per URL per version. + +### Documents Table + +The `documents` table stores individual content chunks: + +- **`id`**: Primary key for the chunk +- **`page_id`**: Foreign key to the pages table +- **`content`**: The text content of this chunk +- **`metadata`**: JSON containing chunk-specific metadata (level, path, types) +- **`sort_order`**: Order of this chunk within the page +- **`embedding`**: Vector embedding for similarity search +- **`created_at`**: Timestamp when chunk was created + +Multiple document chunks link to a single page via `page_id`. + +--- + +## Refresh Workflow + +```mermaid +graph TD + A[Start Refresh] --> B[Load Existing Pages from DB] + B --> C[Create initialQueue with pageId + ETag + depth] + C --> D{Root URL in DB?} + D -->|No| E[Add Root URL at depth 0] + D -->|Yes| F[Root URL already in queue] + E --> G[Begin Scraping] + F --> G + + G --> H[Process Queue Item] + H --> I[Fetch with ETag] + + I --> J{HTTP Status} + J -->|304| K[Skip Processing] + J -->|200| L[Delete Old Chunks] + J -->|404| M[Delete Page & Chunks] + + K --> N[Continue to Next] + L --> O[Full Pipeline Processing] + M --> P[Report Deletion] + + O --> Q[Insert New Chunks] + Q --> R[Update Page Metadata] + R --> S[Extract Links] + + N --> T{More in Queue?} + P --> T + S --> U[Add New Links to Queue] + U --> T + T -->|Yes| H + T -->|No| V[Complete] +``` + +--- + +## Full Re-Crawl Behavior + +Despite using conditional requests, refresh operations perform a **full re-crawl** of the documentation structure. This design choice is intentional and critical for correctness. + +### Why Full Re-Crawl? 
+ +**Link structure can change without content changing:** + +- Page A (unchanged, 304) might add a link to new Page B +- Page C might remove a link, making Page D unreachable +- Navigation menus can be updated without content changes + +**If we only followed stored pages:** + +- Newly added pages would never be discovered +- Reorganizations would break coverage +- Deleted pages might remain in index indefinitely + +### How It Works + +1. **Start from root URL** (depth 0) with ETag check +2. **Even if root returns 304**, extract its links and follow them +3. **Discover new pages** not in the database (no ETag, no pageId) +4. **Process discovered pages** through full pipeline +5. **Delete chunks for 404 pages** to remove from search + +This approach combines the efficiency of conditional requests (skip unchanged pages) with the completeness of full crawling (find new pages). + +--- + +## Link Discovery and Depth Preservation + +### Initial Queue Setup + +Refresh operations receive an `initialQueue` parameter containing all previously indexed pages: + +```typescript +initialQueue: [ + { url: "https://docs.example.com", depth: 0, pageId: 1, etag: "abc123" }, + { + url: "https://docs.example.com/guide", + depth: 1, + pageId: 2, + etag: "def456", + }, + { url: "https://docs.example.com/api", depth: 1, pageId: 3, etag: "ghi789" }, + // ... all other indexed pages +]; +``` + +The **depth value is preserved** from the original scrape. This ensures: + +- Pages respect `maxDepth` limits during refresh +- Depth-based filtering works consistently +- Progress tracking shows accurate depth information + +### New Page Discovery + +When refresh discovers a new page (not in `initialQueue`): + +1. Calculate depth based on parent page: `parent.depth + 1` +2. Assign no `pageId` (will be created during database insert) +3. Process through full pipeline as a new page + +### Root URL Handling + +The root URL is **always processed**, even if it appears in `initialQueue`: + +1. Ensures the entry point is always checked +2. Allows detection of top-level navigation changes +3. Serves as the canonical base for link resolution + +The `BaseScraperStrategy` ensures the root URL appears exactly once in the queue, either from `initialQueue` or added explicitly. 
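+A minimal TypeScript sketch of this per-item check is shown below. The names `QueueItem`, `RefreshOutcome`, and `refreshQueueItem` are illustrative assumptions, not the actual strategy API; the sketch simply mirrors the `initialQueue` entries above and the status handling described earlier.
+
+```typescript
+// Hypothetical per-item change detection during refresh (not the real implementation).
+interface QueueItem {
+  url: string;
+  depth: number;
+  pageId?: number; // absent for newly discovered pages
+  etag?: string; // absent for newly discovered pages
+}
+
+type RefreshOutcome =
+  | { kind: "unchanged"; item: QueueItem } // 304: skip the pipeline
+  | { kind: "modified"; item: QueueItem; body: string } // 200: re-process
+  | { kind: "deleted"; item: QueueItem }; // 404: remove from index
+
+async function refreshQueueItem(item: QueueItem): Promise<RefreshOutcome> {
+  const headers: Record<string, string> = {};
+  if (item.etag) {
+    headers["If-None-Match"] = item.etag; // conditional request for known pages
+  }
+
+  const res = await fetch(item.url, { headers });
+
+  if (res.status === 304) return { kind: "unchanged", item };
+  if (res.status === 404) return { kind: "deleted", item };
+
+  // 200 OK: content changed (or the page is new); hand the body to the full pipeline.
+  return { kind: "modified", item, body: await res.text() };
+}
+```
+
+Known pages carry an `etag` and take the conditional path; newly discovered pages have no `etag`, always return 200, and are routed through the full pipeline as described above.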
+ +--- + +## Strategy-Specific Behavior + +Different scraping strategies handle refresh operations differently based on their data sources: + +### WebScraperStrategy + +**ETag Source:** HTTP `ETag` header from web servers + +**Refresh Characteristics:** + +- Most efficient with modern web servers and CDNs +- Supports conditional requests natively +- Handles redirects by updating canonical URLs +- Discovers new pages through link following + +**Example Scenario:** + +``` +Initial: https://docs.example.com/v1.0/guide +After Redirect: https://docs.example.com/v2.0/guide +Action: Update canonical URL, check ETag, process if changed +``` + +### LocalFileStrategy + +**ETag Source:** File modification time (mtime) converted to ISO string + +**Refresh Characteristics:** + +- Uses filesystem metadata instead of HTTP +- Detects file modifications via mtime comparison +- Discovers new files by scanning directories +- Handles file deletions through missing file detection (ENOENT) + +**Trade-offs:** + +- mtime less granular than HTTP ETags +- Directory structures must be re-scanned fully +- No network overhead (local filesystem) + +### GitHubScraperStrategy + +**ETag Source:** Varies by content type + +**Refresh Characteristics:** + +- Wiki pages: HTTP ETags from GitHub's web interface +- Repository files: GitHub API ETags for raw content +- Mixed approach: Wiki content via web, files via raw.githubusercontent.com + +**Complex Scenarios:** + +- Root URL discovery returns both wiki URL and file URLs +- Wiki refresh follows standard web strategy +- File refresh checks individual file ETags from raw.githubusercontent.com + +**Example Flow:** + +``` +Root: https://github.com/user/repo + ↓ +Discovers: https://github.com/user/repo/wiki (returns 304 or 200) +Discovers: File URLs as HTTPS blob URLs (e.g., /blob/main/README.md) +``` + +--- + +## Database Operations + +### Update Patterns + +Refresh operations perform different database operations based on status: + +**304 Not Modified:** + +- No database changes - content and metadata remain unchanged +- Strategy simply continues to next page in queue + +**200 OK (Modified Content):** + +1. Delete old document chunks for the page +2. Update page metadata via UPSERT (title, etag, last_modified, content_type, depth) +3. Insert new document chunks +4. Update vector embeddings for new chunks + +**404 Not Found:** + +1. Delete all document chunks for the page +2. Delete the page record itself + +**New Page (200 OK, no pageId):** + +1. Insert new page record +2. Insert document chunks +3. Generate and store vector embeddings + +### Concurrency Handling + +The refresh system processes multiple pages concurrently (default: 3 workers). Database operations are: + +- **Atomic** - Each page update is a single transaction in PipelineWorker +- **Isolated** - No cross-page dependencies +- **Idempotent** - Delete + Insert pattern is safe to retry on failure + +The `visited` set in `BaseScraperStrategy` prevents duplicate processing across concurrent workers. 
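+The sketch below illustrates these update patterns with `better-sqlite3` (a project dependency). Table and column names follow the schema section above; the statement shapes, the database path, and the helper names are assumptions rather than the actual store implementation, and embedding generation/storage is omitted.
+
+```typescript
+import Database from "better-sqlite3";
+
+const db = new Database("docs.db"); // hypothetical path
+
+// 200 OK for a known page: delete old chunks, update metadata, insert new chunks.
+// (The section above describes an UPSERT; a plain UPDATE is used here for brevity,
+// assuming the pageId is already known from the refresh queue.)
+const applyModifiedPage = db.transaction(
+  (
+    pageId: number,
+    page: { title: string; etag: string | null; depth: number },
+    chunks: { content: string; metadata: string; sortOrder: number }[],
+  ) => {
+    // 1. Delete the old document chunks for this page.
+    db.prepare("DELETE FROM documents WHERE page_id = ?").run(pageId);
+
+    // 2. Update page metadata (title, ETag, depth, ...).
+    db.prepare("UPDATE pages SET title = ?, etag = ?, depth = ? WHERE id = ?").run(
+      page.title,
+      page.etag,
+      page.depth,
+      pageId,
+    );
+
+    // 3. Insert the freshly chunked content.
+    const insertChunk = db.prepare(
+      "INSERT INTO documents (page_id, content, metadata, sort_order) VALUES (?, ?, ?, ?)",
+    );
+    for (const chunk of chunks) {
+      insertChunk.run(pageId, chunk.content, chunk.metadata, chunk.sortOrder);
+    }
+  },
+);
+
+// 404 Not Found: deleting the page row removes its chunks via ON DELETE CASCADE.
+const deletePage = db.prepare("DELETE FROM pages WHERE id = ?");
+```
+
+Wrapping the three steps in a single transaction mirrors the atomicity guarantee above: a failed refresh of one page can be retried without leaving partially updated chunks behind.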
+ +--- + +## Performance Characteristics + +### Bandwidth Savings + +**Typical documentation site refresh:** + +- 70-90% of pages unchanged (return 304) +- 5-10% of pages modified (return 200) +- 1-5% of pages deleted (return 404) +- <5% of pages newly added + +**Bandwidth reduction:** + +- 304 responses: ~1KB (headers only) +- 200 responses: Full page size +- Net reduction: 70-90% compared to full re-indexing + +### Processing Time + +**Time spent per page:** + +- 304: <50ms (HTTP request + no database changes) +- 200: 500-2000ms (fetch + pipeline + chunking + embeddings) +- 404: <100ms (HTTP request + document deletion) + +**Overall speedup:** + +- Sites with few changes: 5-10x faster than re-indexing +- Sites with many changes: Approaches re-indexing time +- Sweet spot: Weekly/monthly refresh of active documentation + +### Network Efficiency + +**Request patterns:** + +- Single HTTP request per page (no redundant fetches) +- Conditional requests leverage CDN caching +- Failed requests don't retry (404 is definitive) +- Concurrent requests respect `maxConcurrency` limit + +--- + +## Design Trade-offs + +### Full Re-Crawl vs. Stored-Only Check + +**Decision:** Always re-crawl from root, even during refresh + +**Trade-off:** + +- ✅ Discovers new pages automatically +- ✅ Detects navigation changes +- ✅ Removes orphaned pages +- ❌ Requires checking all known pages (even if 304) +- ❌ Network requests for unchanged pages + +**Rationale:** Correctness over performance. The conditional request mechanism mitigates the performance cost while ensuring complete coverage. + +### Hard Deletion vs. Soft Deletion + +**Decision:** Hard delete both document chunks and page records + +**Trade-off:** + +- ✅ Deleted content immediately removed from search +- ✅ Page records completely removed, preventing database bloat +- ✅ Simple implementation (no query filtering needed) +- ✅ Clean database state with no orphaned page records +- ❌ Document chunks and page metadata cannot be recovered +- ❌ No historical tracking of deleted pages + +**Rationale:** Search accuracy is paramount. Deleted content must not appear in results. Complete deletion ensures database remains clean and doesn't accumulate empty page records over time. The page metadata loss is acceptable since deleted pages are no longer relevant to the documentation. + +### ETag Storage per Page + +**Decision:** Store ETags in pages table, not separate cache + +**Trade-off:** + +- ✅ Simple schema, no joins required +- ✅ Atomic updates (page + ETag together) +- ✅ ETag tied to content version +- ❌ Larger pages table +- ❌ ETag duplication if same content on multiple URLs + +**Rationale:** Simplicity and correctness. ETags are intrinsically tied to content versions, not URLs. 
+ +--- + +## Testing Strategy + +Refresh behavior is tested at multiple levels: + +### Unit Tests (Strategy Level) + +Each strategy's test suite includes refresh scenarios: + +- Pages returning 304 (skip processing) +- Pages returning 200 (re-process) +- Pages returning 404 (mark deleted) +- New pages discovered during refresh +- Depth preservation from initialQueue + +**Example:** `LocalFileStrategy.test.ts` refresh workflow tests + +### Integration Tests (Pipeline Level) + +End-to-end refresh workflows: + +- Multi-page refresh with mixed statuses +- Concurrent refresh operations +- Database consistency after refresh +- Link discovery and depth handling + +**Example:** `test/refresh-pipeline-e2e.test.ts` + +### Real-World Scenarios + +Testing against actual documentation sites: + +- GitHub repositories with wiki + files +- NPM package documentation +- Local file hierarchies with modifications + +These tests validate that the refresh system handles real content structures correctly. + +--- + +## Future Enhancements + +Potential improvements to the refresh system: + +### Incremental Refresh + +Only check pages modified since last refresh based on timestamps: + +- Reduces network requests further +- Risk: Miss changes on infrequently checked pages +- Requires careful timestamp management + +### Parallel Strategy Execution + +Run multiple strategies simultaneously for multi-source documentation: + +- Example: GitHub repo files + NPM registry + official docs +- Requires coordination across strategies +- Complex dependency management + +### Smart Re-crawl Scheduling + +Adjust refresh frequency based on historical change patterns: + +- Stable pages: Check less frequently +- Volatile pages: Check more frequently +- Requires tracking change history per page + +### Webhook-Based Updates + +Trigger refresh on content update notifications: + +- GitHub webhooks for repository changes +- CMS webhooks for documentation updates +- Eliminates polling, reduces latency +- Requires webhook infrastructure + +--- + +## Summary + +The refresh architecture achieves **efficient re-indexing** through: + +1. **Conditional HTTP requests** - Let servers decide what changed +2. **Full re-crawl** - Ensure complete coverage despite conditional requests +3. **Status-based handling** - Different actions for 304/200/404 +4. **Depth preservation** - Maintain original discovery structure +5. **Unified pipeline** - Same code paths as initial scraping + +This design balances **performance** (skip unchanged content) with **correctness** (discover all changes) while maintaining **simplicity** (reuse existing infrastructure). + +Refresh is not a separate system - it's the same scraping pipeline with smarter change detection. 
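+As a closing illustration of the unit-test level described in the Testing Strategy section, the sketch below uses `vitest` and `nock` (both dev dependencies) to verify the 304 fast path. It is illustrative only: `checkPageForRefresh` is a hypothetical helper standing in for the real strategy method, and the URL and ETag values are made up.
+
+```typescript
+import nock from "nock";
+import { describe, expect, it } from "vitest";
+
+// Hypothetical helper standing in for the real strategy/fetcher method.
+async function checkPageForRefresh(
+  url: string,
+  etag: string,
+): Promise<"unchanged" | "modified" | "deleted"> {
+  const res = await fetch(url, { headers: { "If-None-Match": etag } });
+  if (res.status === 304) return "unchanged";
+  if (res.status === 404) return "deleted";
+  return "modified";
+}
+
+describe("refresh change detection", () => {
+  it("skips processing when the server replies 304 Not Modified", async () => {
+    nock("https://docs.example.com")
+      .get("/guide")
+      .matchHeader("if-none-match", '"abc123"')
+      .reply(304);
+
+    await expect(
+      checkPageForRefresh("https://docs.example.com/guide", '"abc123"'),
+    ).resolves.toBe("unchanged");
+  });
+});
+```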
diff --git a/package-lock.json b/package-lock.json index 4ab05410..8184dbe5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,7 +11,7 @@ "license": "MIT", "dependencies": { "@fastify/formbody": "^8.0.2", - "@fastify/static": "^8.2.0", + "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", @@ -19,57 +19,57 @@ "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", - "@modelcontextprotocol/sdk": "^1.17.1", - "@trpc/client": "^11.4.4", + "@modelcontextprotocol/sdk": "^1.20.2", + "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", - "axios": "^1.11.0", + "axios": "^1.13.1", "axios-retry": "^4.5.0", - "better-sqlite3": "^12.2.0", + "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", - "dompurify": "^3.2.6", - "dotenv": "^17.2.1", + "dompurify": "^3.3.0", + "dotenv": "^17.2.3", "env-paths": "^3.0.0", - "fastify": "^5.4.0", + "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", - "header-generator": "^2.1.69", + "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", - "jose": "^6.0.12", + "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", - "mime": "^4.0.7", + "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", - "posthog-node": "^5.7.0", + "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", - "semver": "^7.7.2", + "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", - "turndown": "^7.2.0", - "zod": "^4.0.14" + "turndown": "^7.2.2", + "zod": "^4.1.12" }, "bin": { "docs-mcp-server": "dist/index.js" }, "devDependencies": { - "@biomejs/biome": "^2.1.3", + "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", - "@semantic-release/github": "^11.0.3", + "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", - "@tailwindcss/postcss": "^4.1.11", - "@tailwindcss/vite": "^4.1.11", + "@tailwindcss/postcss": "^4.1.16", + "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", @@ -77,18 +77,19 @@ "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", - "@types/semver": "^7.7.0", - "@types/turndown": "^5.0.5", + "@types/semver": "^7.7.1", + "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", - "lint-staged": "^16.1.2", - "memfs": "^4.34.0", + "lint-staged": "^16.2.6", + "memfs": "^4.50.0", + "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", - "semantic-release": "^24.2.7", + "semantic-release": "^24.2.9", "tailwindcss": "^4.1.4", - "typescript": "^5.9.2", + "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", @@ -1008,9 +1009,9 @@ } }, "node_modules/@biomejs/biome": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/biome/-/biome-2.2.0.tgz", - "integrity": "sha512-3On3RSYLsX+n9KnoSgfoYlckYBoU6VRM22cw1gB4Y0OuUVSYd/O/2saOJMrA4HFfA1Ff0eacOvMN1yAAvHtzIw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/biome/-/biome-2.3.2.tgz", + "integrity": 
"sha512-8e9tzamuDycx7fdrcJ/F/GDZ8SYukc5ud6tDicjjFqURKYFSWMl0H0iXNXZEGmcmNUmABgGuHThPykcM41INgg==", "dev": true, "license": "MIT OR Apache-2.0", "bin": { @@ -1024,20 +1025,20 @@ "url": "https://opencollective.com/biome" }, "optionalDependencies": { - "@biomejs/cli-darwin-arm64": "2.2.0", - "@biomejs/cli-darwin-x64": "2.2.0", - "@biomejs/cli-linux-arm64": "2.2.0", - "@biomejs/cli-linux-arm64-musl": "2.2.0", - "@biomejs/cli-linux-x64": "2.2.0", - "@biomejs/cli-linux-x64-musl": "2.2.0", - "@biomejs/cli-win32-arm64": "2.2.0", - "@biomejs/cli-win32-x64": "2.2.0" + "@biomejs/cli-darwin-arm64": "2.3.2", + "@biomejs/cli-darwin-x64": "2.3.2", + "@biomejs/cli-linux-arm64": "2.3.2", + "@biomejs/cli-linux-arm64-musl": "2.3.2", + "@biomejs/cli-linux-x64": "2.3.2", + "@biomejs/cli-linux-x64-musl": "2.3.2", + "@biomejs/cli-win32-arm64": "2.3.2", + "@biomejs/cli-win32-x64": "2.3.2" } }, "node_modules/@biomejs/cli-darwin-arm64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-arm64/-/cli-darwin-arm64-2.2.0.tgz", - "integrity": "sha512-zKbwUUh+9uFmWfS8IFxmVD6XwqFcENjZvEyfOxHs1epjdH3wyyMQG80FGDsmauPwS2r5kXdEM0v/+dTIA9FXAg==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-arm64/-/cli-darwin-arm64-2.3.2.tgz", + "integrity": "sha512-4LECm4kc3If0JISai4c3KWQzukoUdpxy4fRzlrPcrdMSRFksR9ZoXK7JBcPuLBmd2SoT4/d7CQS33VnZpgBjew==", "cpu": [ "arm64" ], @@ -1052,9 +1053,9 @@ } }, "node_modules/@biomejs/cli-darwin-x64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-x64/-/cli-darwin-x64-2.2.0.tgz", - "integrity": "sha512-+OmT4dsX2eTfhD5crUOPw3RPhaR+SKVspvGVmSdZ9y9O/AgL8pla6T4hOn1q+VAFBHuHhsdxDRJgFCSC7RaMOw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-x64/-/cli-darwin-x64-2.3.2.tgz", + "integrity": "sha512-jNMnfwHT4N3wi+ypRfMTjLGnDmKYGzxVr1EYAPBcauRcDnICFXN81wD6wxJcSUrLynoyyYCdfW6vJHS/IAoTDA==", "cpu": [ "x64" ], @@ -1069,9 +1070,9 @@ } }, "node_modules/@biomejs/cli-linux-arm64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64/-/cli-linux-arm64-2.2.0.tgz", - "integrity": "sha512-6eoRdF2yW5FnW9Lpeivh7Mayhq0KDdaDMYOJnH9aT02KuSIX5V1HmWJCQQPwIQbhDh68Zrcpl8inRlTEan0SXw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64/-/cli-linux-arm64-2.3.2.tgz", + "integrity": "sha512-amnqvk+gWybbQleRRq8TMe0rIv7GHss8mFJEaGuEZYWg1Tw14YKOkeo8h6pf1c+d3qR+JU4iT9KXnBKGON4klw==", "cpu": [ "arm64" ], @@ -1086,9 +1087,9 @@ } }, "node_modules/@biomejs/cli-linux-arm64-musl": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64-musl/-/cli-linux-arm64-musl-2.2.0.tgz", - "integrity": "sha512-egKpOa+4FL9YO+SMUMLUvf543cprjevNc3CAgDNFLcjknuNMcZ0GLJYa3EGTCR2xIkIUJDVneBV3O9OcIlCEZQ==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64-musl/-/cli-linux-arm64-musl-2.3.2.tgz", + "integrity": "sha512-2Zz4usDG1GTTPQnliIeNx6eVGGP2ry5vE/v39nT73a3cKN6t5H5XxjcEoZZh62uVZvED7hXXikclvI64vZkYqw==", "cpu": [ "arm64" ], @@ -1103,9 +1104,9 @@ } }, "node_modules/@biomejs/cli-linux-x64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64/-/cli-linux-x64-2.2.0.tgz", - "integrity": "sha512-5UmQx/OZAfJfi25zAnAGHUMuOd+LOsliIt119x2soA2gLggQYrVPA+2kMUxR6Mw5M1deUF/AWWP2qpxgH7Nyfw==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64/-/cli-linux-x64-2.3.2.tgz", + "integrity": 
"sha512-8BG/vRAhFz1pmuyd24FQPhNeueLqPtwvZk6yblABY2gzL2H8fLQAF/Z2OPIc+BPIVPld+8cSiKY/KFh6k81xfA==", "cpu": [ "x64" ], @@ -1120,9 +1121,9 @@ } }, "node_modules/@biomejs/cli-linux-x64-musl": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64-musl/-/cli-linux-x64-musl-2.2.0.tgz", - "integrity": "sha512-I5J85yWwUWpgJyC1CcytNSGusu2p9HjDnOPAFG4Y515hwRD0jpR9sT9/T1cKHtuCvEQ/sBvx+6zhz9l9wEJGAg==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64-musl/-/cli-linux-x64-musl-2.3.2.tgz", + "integrity": "sha512-gzB19MpRdTuOuLtPpFBGrV3Lq424gHyq2lFj8wfX9tvLMLdmA/R9C7k/mqBp/spcbWuHeIEKgEs3RviOPcWGBA==", "cpu": [ "x64" ], @@ -1137,9 +1138,9 @@ } }, "node_modules/@biomejs/cli-win32-arm64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-arm64/-/cli-win32-arm64-2.2.0.tgz", - "integrity": "sha512-n9a1/f2CwIDmNMNkFs+JI0ZjFnMO0jdOyGNtihgUNFnlmd84yIYY2KMTBmMV58ZlVHjgmY5Y6E1hVTnSRieggA==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-arm64/-/cli-win32-arm64-2.3.2.tgz", + "integrity": "sha512-lCruqQlfWjhMlOdyf5pDHOxoNm4WoyY2vZ4YN33/nuZBRstVDuqPPjS0yBkbUlLEte11FbpW+wWSlfnZfSIZvg==", "cpu": [ "arm64" ], @@ -1154,9 +1155,9 @@ } }, "node_modules/@biomejs/cli-win32-x64": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-x64/-/cli-win32-x64-2.2.0.tgz", - "integrity": "sha512-Nawu5nHjP/zPKTIryh2AavzTc/KEg4um/MxWdXW0A6P/RZOyIpa7+QSjeXwAwX/utJGaCoXRPWtF3m5U/bB3Ww==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-x64/-/cli-win32-x64-2.3.2.tgz", + "integrity": "sha512-6Ee9P26DTb4D8sN9nXxgbi9Dw5vSOfH98M7UlmkjKB2vtUbrRqCbZiNfryGiwnPIpd6YUoTl7rLVD2/x1CyEHQ==", "cpu": [ "x64" ], @@ -2160,9 +2161,9 @@ } }, "node_modules/@fastify/static": { - "version": "8.2.0", - "resolved": "https://registry.npmjs.org/@fastify/static/-/static-8.2.0.tgz", - "integrity": "sha512-PejC/DtT7p1yo3p+W7LiUtLMsV8fEvxAK15sozHy9t8kwo5r0uLYmhV/inURmGz1SkHZFz/8CNtHLPyhKcx4SQ==", + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/@fastify/static/-/static-8.3.0.tgz", + "integrity": "sha512-yKxviR5PH1OKNnisIzZKmgZSus0r2OZb8qCSbqmw34aolT4g3UlzYfeBRym+HJ1J471CR8e2ldNub4PubD1coA==", "funding": [ { "type": "github", @@ -2230,19 +2231,6 @@ "node": ">=12" } }, - "node_modules/@isaacs/fs-minipass": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz", - "integrity": "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==", - "dev": true, - "license": "ISC", - "dependencies": { - "minipass": "^7.0.4" - }, - "engines": { - "node": ">=18.0.0" - } - }, "node_modules/@joplin/turndown-plugin-gfm": { "version": "1.0.62", "resolved": "https://registry.npmjs.org/@joplin/turndown-plugin-gfm/-/turndown-plugin-gfm-1.0.62.tgz", @@ -2289,9 +2277,9 @@ "license": "MIT" }, "node_modules/@jridgewell/trace-mapping": { - "version": "0.3.30", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.30.tgz", - "integrity": "sha512-GQ7Nw5G2lTu/BtHTKfXhKHok2WGetd4XYcVKGx00SjAk8GMwgJM3zr6zORiPGuOE+/vkc90KtTosSSvaCjKb2Q==", + "version": "0.3.31", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", + "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", "dev": true, "license": "MIT", "dependencies": { @@ -2927,9 +2915,9 @@ "license": 
"BSD-2-Clause" }, "node_modules/@modelcontextprotocol/sdk": { - "version": "1.17.3", - "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.17.3.tgz", - "integrity": "sha512-JPwUKWSsbzx+DLFznf/QZ32Qa+ptfbUlHhRLrBQBAFu9iI1iYvizM4p+zhhRDceSsPutXp4z+R/HPVphlIiclg==", + "version": "1.20.2", + "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.20.2.tgz", + "integrity": "sha512-6rqTdFt67AAAzln3NOKsXRmv5ZzPkgbfaebKBqUbts7vK1GZudqnrun5a8d3M/h955cam9RHZ6Jb4Y1XhnmFPg==", "license": "MIT", "dependencies": { "ajv": "^6.12.6", @@ -2989,42 +2977,22 @@ "zod": "^3.24.1" } }, - "node_modules/@nodelib/fs.scandir": { - "version": "2.1.5", - "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", - "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "node_modules/@mswjs/interceptors": { + "version": "0.39.8", + "resolved": "https://registry.npmjs.org/@mswjs/interceptors/-/interceptors-0.39.8.tgz", + "integrity": "sha512-2+BzZbjRO7Ct61k8fMNHEtoKjeWI9pIlHFTqBwZ5icHpqszIgEZbjb1MW5Z0+bITTCTl3gk4PDBxs9tA/csXvA==", "dev": true, "license": "MIT", "dependencies": { - "@nodelib/fs.stat": "2.0.5", - "run-parallel": "^1.1.9" + "@open-draft/deferred-promise": "^2.2.0", + "@open-draft/logger": "^0.3.0", + "@open-draft/until": "^2.0.0", + "is-node-process": "^1.2.0", + "outvariant": "^1.4.3", + "strict-event-emitter": "^0.5.1" }, "engines": { - "node": ">= 8" - } - }, - "node_modules/@nodelib/fs.stat": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", - "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 8" - } - }, - "node_modules/@nodelib/fs.walk": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", - "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@nodelib/fs.scandir": "2.1.5", - "fastq": "^1.6.0" - }, - "engines": { - "node": ">= 8" + "node": ">=18" } }, "node_modules/@octokit/auth-token": { @@ -3183,6 +3151,31 @@ "@octokit/openapi-types": "^25.1.0" } }, + "node_modules/@open-draft/deferred-promise": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@open-draft/deferred-promise/-/deferred-promise-2.2.0.tgz", + "integrity": "sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@open-draft/logger": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/@open-draft/logger/-/logger-0.3.0.tgz", + "integrity": "sha512-X2g45fzhxH238HKO4xbSr7+wBS8Fvw6ixhTDuvLd5mqh6bJJCFAPwU9mPDxbcrRtfxv4u5IHCEH77BmxvXmmxQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-node-process": "^1.2.0", + "outvariant": "^1.4.0" + } + }, + "node_modules/@open-draft/until": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/@open-draft/until/-/until-2.1.0.tgz", + "integrity": "sha512-U69T3ItWHvLwGg5eJ0n3I62nWuE6ilHlmz7zM0npLBRvPRd7e6NYmg54vvRtP5mZG7kZqZCFVdsTWo7BPtBujg==", + "dev": true, + "license": "MIT" + }, "node_modules/@pnpm/config.env-replace": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@pnpm/config.env-replace/-/config.env-replace-1.1.0.tgz", @@ -3239,9 +3232,9 @@ } }, 
"node_modules/@posthog/core": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/@posthog/core/-/core-1.0.0.tgz", - "integrity": "sha512-gquQld+duT9DdzLIFoHZkUMW0DZOTSLCtSjuuC/zKFz65Qecbz9p37DHBJMkw0dCuB8Mgh2GtH8Ag3PznJrP3g==", + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@posthog/core/-/core-1.5.0.tgz", + "integrity": "sha512-oxfV20QMNwH30jKybUyqi3yGuMghULQz1zkJgQG3rjpHDxhD2vDN6E7UpmaqgphMIvGG3Q+DgfU10zfSPA7w7w==", "license": "MIT" }, "node_modules/@rollup/plugin-node-resolve": { @@ -3855,9 +3848,9 @@ } }, "node_modules/@semantic-release/github": { - "version": "11.0.4", - "resolved": "https://registry.npmjs.org/@semantic-release/github/-/github-11.0.4.tgz", - "integrity": "sha512-fU/nLSjkp9DmB0h7FVO5imhhWJMvq2LjD4+3lz3ZAzpDLY9+KYwC+trJ+g7LbZeJv9y3L9fSFSg2DduUpiT6bw==", + "version": "11.0.6", + "resolved": "https://registry.npmjs.org/@semantic-release/github/-/github-11.0.6.tgz", + "integrity": "sha512-ctDzdSMrT3H+pwKBPdyCPty6Y47X8dSrjd3aPZ5KKIKKWTwZBE9De8GtsH3TyAlw3Uyo2stegMx6rJMXKpJwJA==", "dev": true, "license": "MIT", "dependencies": { @@ -3869,13 +3862,13 @@ "aggregate-error": "^5.0.0", "debug": "^4.3.4", "dir-glob": "^3.0.1", - "globby": "^14.0.0", "http-proxy-agent": "^7.0.0", "https-proxy-agent": "^7.0.0", "issue-parser": "^7.0.0", "lodash-es": "^4.17.21", "mime": "^4.0.0", "p-filter": "^4.0.0", + "tinyglobby": "^0.2.14", "url-join": "^5.0.0" }, "engines": { @@ -4266,19 +4259,6 @@ "url": "https://github.com/sindresorhus/is?sponsor=1" } }, - "node_modules/@sindresorhus/merge-streams": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/@sindresorhus/merge-streams/-/merge-streams-2.3.0.tgz", - "integrity": "sha512-LtoMMhxAlorcGhmFYI+LhPgbPZCkgP6ra1YL604EeF6U98pLlQ3iWIGMdWSC+vWmPBWBNgmDBAhnAobLROJmwg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/@smithy/abort-controller": { "version": "4.0.5", "resolved": "https://registry.npmjs.org/@smithy/abort-controller/-/abort-controller-4.0.5.tgz", @@ -4920,54 +4900,49 @@ } }, "node_modules/@tailwindcss/node": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/node/-/node-4.1.12.tgz", - "integrity": "sha512-3hm9brwvQkZFe++SBt+oLjo4OLDtkvlE8q2WalaD/7QWaeM7KEJbAiY/LJZUaCs7Xa8aUu4xy3uoyX4q54UVdQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/node/-/node-4.1.16.tgz", + "integrity": "sha512-BX5iaSsloNuvKNHRN3k2RcCuTEgASTo77mofW0vmeHkfrDWaoFAFvNHpEgtu0eqyypcyiBkDWzSMxJhp3AUVcw==", "dev": true, "license": "MIT", "dependencies": { "@jridgewell/remapping": "^2.3.4", "enhanced-resolve": "^5.18.3", - "jiti": "^2.5.1", - "lightningcss": "1.30.1", - "magic-string": "^0.30.17", + "jiti": "^2.6.1", + "lightningcss": "1.30.2", + "magic-string": "^0.30.19", "source-map-js": "^1.2.1", - "tailwindcss": "4.1.12" + "tailwindcss": "4.1.16" } }, "node_modules/@tailwindcss/oxide": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.12.tgz", - "integrity": "sha512-gM5EoKHW/ukmlEtphNwaGx45fGoEmP10v51t9unv55voWh6WrOL19hfuIdo2FjxIaZzw776/BUQg7Pck++cIVw==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.16.tgz", + "integrity": "sha512-2OSv52FRuhdlgyOQqgtQHuCgXnS8nFSYRp2tJ+4WZXKgTxqPy7SMSls8c3mPT5pkZ17SBToGM5LHEJBO7miEdg==", "dev": true, - "hasInstallScript": true, "license": "MIT", - "dependencies": { - "detect-libc": "^2.0.4", - "tar": "^7.4.3" - }, 
"engines": { "node": ">= 10" }, "optionalDependencies": { - "@tailwindcss/oxide-android-arm64": "4.1.12", - "@tailwindcss/oxide-darwin-arm64": "4.1.12", - "@tailwindcss/oxide-darwin-x64": "4.1.12", - "@tailwindcss/oxide-freebsd-x64": "4.1.12", - "@tailwindcss/oxide-linux-arm-gnueabihf": "4.1.12", - "@tailwindcss/oxide-linux-arm64-gnu": "4.1.12", - "@tailwindcss/oxide-linux-arm64-musl": "4.1.12", - "@tailwindcss/oxide-linux-x64-gnu": "4.1.12", - "@tailwindcss/oxide-linux-x64-musl": "4.1.12", - "@tailwindcss/oxide-wasm32-wasi": "4.1.12", - "@tailwindcss/oxide-win32-arm64-msvc": "4.1.12", - "@tailwindcss/oxide-win32-x64-msvc": "4.1.12" + "@tailwindcss/oxide-android-arm64": "4.1.16", + "@tailwindcss/oxide-darwin-arm64": "4.1.16", + "@tailwindcss/oxide-darwin-x64": "4.1.16", + "@tailwindcss/oxide-freebsd-x64": "4.1.16", + "@tailwindcss/oxide-linux-arm-gnueabihf": "4.1.16", + "@tailwindcss/oxide-linux-arm64-gnu": "4.1.16", + "@tailwindcss/oxide-linux-arm64-musl": "4.1.16", + "@tailwindcss/oxide-linux-x64-gnu": "4.1.16", + "@tailwindcss/oxide-linux-x64-musl": "4.1.16", + "@tailwindcss/oxide-wasm32-wasi": "4.1.16", + "@tailwindcss/oxide-win32-arm64-msvc": "4.1.16", + "@tailwindcss/oxide-win32-x64-msvc": "4.1.16" } }, "node_modules/@tailwindcss/oxide-android-arm64": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-android-arm64/-/oxide-android-arm64-4.1.12.tgz", - "integrity": "sha512-oNY5pq+1gc4T6QVTsZKwZaGpBb2N1H1fsc1GD4o7yinFySqIuRZ2E4NvGasWc6PhYJwGK2+5YT1f9Tp80zUQZQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-android-arm64/-/oxide-android-arm64-4.1.16.tgz", + "integrity": "sha512-8+ctzkjHgwDJ5caq9IqRSgsP70xhdhJvm+oueS/yhD5ixLhqTw9fSL1OurzMUhBwE5zK26FXLCz2f/RtkISqHA==", "cpu": [ "arm64" ], @@ -4982,9 +4957,9 @@ } }, "node_modules/@tailwindcss/oxide-darwin-arm64": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-arm64/-/oxide-darwin-arm64-4.1.12.tgz", - "integrity": "sha512-cq1qmq2HEtDV9HvZlTtrj671mCdGB93bVY6J29mwCyaMYCP/JaUBXxrQQQm7Qn33AXXASPUb2HFZlWiiHWFytw==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-arm64/-/oxide-darwin-arm64-4.1.16.tgz", + "integrity": "sha512-C3oZy5042v2FOALBZtY0JTDnGNdS6w7DxL/odvSny17ORUnaRKhyTse8xYi3yKGyfnTUOdavRCdmc8QqJYwFKA==", "cpu": [ "arm64" ], @@ -4999,9 +4974,9 @@ } }, "node_modules/@tailwindcss/oxide-darwin-x64": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-x64/-/oxide-darwin-x64-4.1.12.tgz", - "integrity": "sha512-6UCsIeFUcBfpangqlXay9Ffty9XhFH1QuUFn0WV83W8lGdX8cD5/+2ONLluALJD5+yJ7k8mVtwy3zMZmzEfbLg==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-x64/-/oxide-darwin-x64-4.1.16.tgz", + "integrity": "sha512-vjrl/1Ub9+JwU6BP0emgipGjowzYZMjbWCDqwA2Z4vCa+HBSpP4v6U2ddejcHsolsYxwL5r4bPNoamlV0xDdLg==", "cpu": [ "x64" ], @@ -5016,9 +4991,9 @@ } }, "node_modules/@tailwindcss/oxide-freebsd-x64": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-freebsd-x64/-/oxide-freebsd-x64-4.1.12.tgz", - "integrity": "sha512-JOH/f7j6+nYXIrHobRYCtoArJdMJh5zy5lr0FV0Qu47MID/vqJAY3r/OElPzx1C/wdT1uS7cPq+xdYYelny1ww==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-freebsd-x64/-/oxide-freebsd-x64-4.1.16.tgz", + "integrity": "sha512-TSMpPYpQLm+aR1wW5rKuUuEruc/oOX3C7H0BTnPDn7W/eMw8W+MRMpiypKMkXZfwH8wqPIRKppuZoedTtNj2tg==", "cpu": [ "x64" ], @@ -5033,9 +5008,9 
@@ } }, "node_modules/@tailwindcss/oxide-linux-arm-gnueabihf": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm-gnueabihf/-/oxide-linux-arm-gnueabihf-4.1.12.tgz", - "integrity": "sha512-v4Ghvi9AU1SYgGr3/j38PD8PEe6bRfTnNSUE3YCMIRrrNigCFtHZ2TCm8142X8fcSqHBZBceDx+JlFJEfNg5zQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm-gnueabihf/-/oxide-linux-arm-gnueabihf-4.1.16.tgz", + "integrity": "sha512-p0GGfRg/w0sdsFKBjMYvvKIiKy/LNWLWgV/plR4lUgrsxFAoQBFrXkZ4C0w8IOXfslB9vHK/JGASWD2IefIpvw==", "cpu": [ "arm" ], @@ -5050,9 +5025,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-arm64-gnu": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-gnu/-/oxide-linux-arm64-gnu-4.1.12.tgz", - "integrity": "sha512-YP5s1LmetL9UsvVAKusHSyPlzSRqYyRB0f+Kl/xcYQSPLEw/BvGfxzbH+ihUciePDjiXwHh+p+qbSP3SlJw+6g==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-gnu/-/oxide-linux-arm64-gnu-4.1.16.tgz", + "integrity": "sha512-DoixyMmTNO19rwRPdqviTrG1rYzpxgyYJl8RgQvdAQUzxC1ToLRqtNJpU/ATURSKgIg6uerPw2feW0aS8SNr/w==", "cpu": [ "arm64" ], @@ -5067,9 +5042,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-arm64-musl": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-musl/-/oxide-linux-arm64-musl-4.1.12.tgz", - "integrity": "sha512-V8pAM3s8gsrXcCv6kCHSuwyb/gPsd863iT+v1PGXC4fSL/OJqsKhfK//v8P+w9ThKIoqNbEnsZqNy+WDnwQqCA==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-musl/-/oxide-linux-arm64-musl-4.1.16.tgz", + "integrity": "sha512-H81UXMa9hJhWhaAUca6bU2wm5RRFpuHImrwXBUvPbYb+3jo32I9VIwpOX6hms0fPmA6f2pGVlybO6qU8pF4fzQ==", "cpu": [ "arm64" ], @@ -5084,9 +5059,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-x64-gnu": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-gnu/-/oxide-linux-x64-gnu-4.1.12.tgz", - "integrity": "sha512-xYfqYLjvm2UQ3TZggTGrwxjYaLB62b1Wiysw/YE3Yqbh86sOMoTn0feF98PonP7LtjsWOWcXEbGqDL7zv0uW8Q==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-gnu/-/oxide-linux-x64-gnu-4.1.16.tgz", + "integrity": "sha512-ZGHQxDtFC2/ruo7t99Qo2TTIvOERULPl5l0K1g0oK6b5PGqjYMga+FcY1wIUnrUxY56h28FxybtDEla+ICOyew==", "cpu": [ "x64" ], @@ -5101,9 +5076,9 @@ } }, "node_modules/@tailwindcss/oxide-linux-x64-musl": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-musl/-/oxide-linux-x64-musl-4.1.12.tgz", - "integrity": "sha512-ha0pHPamN+fWZY7GCzz5rKunlv9L5R8kdh+YNvP5awe3LtuXb5nRi/H27GeL2U+TdhDOptU7T6Is7mdwh5Ar3A==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-musl/-/oxide-linux-x64-musl-4.1.16.tgz", + "integrity": "sha512-Oi1tAaa0rcKf1Og9MzKeINZzMLPbhxvm7rno5/zuP1WYmpiG0bEHq4AcRUiG2165/WUzvxkW4XDYCscZWbTLZw==", "cpu": [ "x64" ], @@ -5118,9 +5093,9 @@ } }, "node_modules/@tailwindcss/oxide-wasm32-wasi": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-wasm32-wasi/-/oxide-wasm32-wasi-4.1.12.tgz", - "integrity": "sha512-4tSyu3dW+ktzdEpuk6g49KdEangu3eCYoqPhWNsZgUhyegEda3M9rG0/j1GV/JjVVsj+lG7jWAyrTlLzd/WEBg==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-wasm32-wasi/-/oxide-wasm32-wasi-4.1.16.tgz", + "integrity": 
"sha512-B01u/b8LteGRwucIBmCQ07FVXLzImWESAIMcUU6nvFt/tYsQ6IHz8DmZ5KtvmwxD+iTYBtM1xwoGXswnlu9v0Q==", "bundleDependencies": [ "@napi-rs/wasm-runtime", "@emnapi/core", @@ -5136,21 +5111,21 @@ "license": "MIT", "optional": true, "dependencies": { - "@emnapi/core": "^1.4.5", - "@emnapi/runtime": "^1.4.5", - "@emnapi/wasi-threads": "^1.0.4", - "@napi-rs/wasm-runtime": "^0.2.12", - "@tybys/wasm-util": "^0.10.0", - "tslib": "^2.8.0" + "@emnapi/core": "^1.5.0", + "@emnapi/runtime": "^1.5.0", + "@emnapi/wasi-threads": "^1.1.0", + "@napi-rs/wasm-runtime": "^1.0.7", + "@tybys/wasm-util": "^0.10.1", + "tslib": "^2.4.0" }, "engines": { "node": ">=14.0.0" } }, "node_modules/@tailwindcss/oxide-win32-arm64-msvc": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-arm64-msvc/-/oxide-win32-arm64-msvc-4.1.12.tgz", - "integrity": "sha512-iGLyD/cVP724+FGtMWslhcFyg4xyYyM+5F4hGvKA7eifPkXHRAUDFaimu53fpNg9X8dfP75pXx/zFt/jlNF+lg==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-arm64-msvc/-/oxide-win32-arm64-msvc-4.1.16.tgz", + "integrity": "sha512-zX+Q8sSkGj6HKRTMJXuPvOcP8XfYON24zJBRPlszcH1Np7xuHXhWn8qfFjIujVzvH3BHU+16jBXwgpl20i+v9A==", "cpu": [ "arm64" ], @@ -5165,9 +5140,9 @@ } }, "node_modules/@tailwindcss/oxide-win32-x64-msvc": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-x64-msvc/-/oxide-win32-x64-msvc-4.1.12.tgz", - "integrity": "sha512-NKIh5rzw6CpEodv/++r0hGLlfgT/gFN+5WNdZtvh6wpU2BpGNgdjvj6H2oFc8nCM839QM1YOhjpgbAONUb4IxA==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-x64-msvc/-/oxide-win32-x64-msvc-4.1.16.tgz", + "integrity": "sha512-m5dDFJUEejbFqP+UXVstd4W/wnxA4F61q8SoL+mqTypId2T2ZpuxosNSgowiCnLp2+Z+rivdU0AqpfgiD7yCBg==", "cpu": [ "x64" ], @@ -5182,51 +5157,51 @@ } }, "node_modules/@tailwindcss/postcss": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/postcss/-/postcss-4.1.12.tgz", - "integrity": "sha512-5PpLYhCAwf9SJEeIsSmCDLgyVfdBhdBpzX1OJ87anT9IVR0Z9pjM0FNixCAUAHGnMBGB8K99SwAheXrT0Kh6QQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/postcss/-/postcss-4.1.16.tgz", + "integrity": "sha512-Qn3SFGPXYQMKR/UtqS+dqvPrzEeBZHrFA92maT4zijCVggdsXnDBMsPFJo1eArX3J+O+Gi+8pV4PkqjLCNBk3A==", "dev": true, "license": "MIT", "dependencies": { "@alloc/quick-lru": "^5.2.0", - "@tailwindcss/node": "4.1.12", - "@tailwindcss/oxide": "4.1.12", + "@tailwindcss/node": "4.1.16", + "@tailwindcss/oxide": "4.1.16", "postcss": "^8.4.41", - "tailwindcss": "4.1.12" + "tailwindcss": "4.1.16" } }, "node_modules/@tailwindcss/vite": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@tailwindcss/vite/-/vite-4.1.12.tgz", - "integrity": "sha512-4pt0AMFDx7gzIrAOIYgYP0KCBuKWqyW8ayrdiLEjoJTT4pKTjrzG/e4uzWtTLDziC+66R9wbUqZBccJalSE5vQ==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/vite/-/vite-4.1.16.tgz", + "integrity": "sha512-bbguNBcDxsRmi9nnlWJxhfDWamY3lmcyACHcdO1crxfzuLpOhHLLtEIN/nCbbAtj5rchUgQD17QVAKi1f7IsKg==", "dev": true, "license": "MIT", "dependencies": { - "@tailwindcss/node": "4.1.12", - "@tailwindcss/oxide": "4.1.12", - "tailwindcss": "4.1.12" + "@tailwindcss/node": "4.1.16", + "@tailwindcss/oxide": "4.1.16", + "tailwindcss": "4.1.16" }, "peerDependencies": { "vite": "^5.2.0 || ^6 || ^7" } }, "node_modules/@trpc/client": { - "version": "11.4.4", - "resolved": "https://registry.npmjs.org/@trpc/client/-/client-11.4.4.tgz", - "integrity": 
"sha512-86OZl+Y+Xlt9ITGlhCMImERcsWCOrVzpNuzg3XBlsDSmSs9NGsghKjeCpJQlE36XaG3aze+o9pRukiYYvBqxgQ==", + "version": "11.7.1", + "resolved": "https://registry.npmjs.org/@trpc/client/-/client-11.7.1.tgz", + "integrity": "sha512-uOnAjElKI892/U6aQMcBHYs3x7mme3Cvv1F87ytBL56rBvs7+DyK7r43zgaXKf13+GtPEI6ex5xjVUfyDW8XcQ==", "funding": [ "https://trpc.io/sponsor" ], "license": "MIT", "peerDependencies": { - "@trpc/server": "11.4.4", + "@trpc/server": "11.7.1", "typescript": ">=5.7.2" } }, "node_modules/@trpc/server": { - "version": "11.4.4", - "resolved": "https://registry.npmjs.org/@trpc/server/-/server-11.4.4.tgz", - "integrity": "sha512-VkJb2xnb4rCynuwlCvgPBh5aM+Dco6fBBIo6lWAdJJRYVwtyE5bxNZBgUvRRz/cFSEAy0vmzLxF7aABDJfK5Rg==", + "version": "11.7.1", + "resolved": "https://registry.npmjs.org/@trpc/server/-/server-11.7.1.tgz", + "integrity": "sha512-N3U8LNLIP4g9C7LJ/sLkjuPHwqlvE3bnspzC4DEFVdvx2+usbn70P80E3wj5cjOTLhmhRiwJCSXhlB+MHfGeCw==", "funding": [ "https://trpc.io/sponsor" ], @@ -5392,9 +5367,9 @@ "license": "MIT" }, "node_modules/@types/semver": { - "version": "7.7.0", - "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.7.0.tgz", - "integrity": "sha512-k107IF4+Xr7UHjwDc7Cfd6PRQfbdkiRabXGRjo07b4WyPahFBZCZ1sE+BNxYIJPPg73UkfOsVOLwqVc/6ETrIA==", + "version": "7.7.1", + "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.7.1.tgz", + "integrity": "sha512-FmgJfu+MOcQ370SD0ev7EI8TlCAfKYU+B4m5T3yXc1CiRN94g/SZPtsCkk506aUDtlMnFZvasDwHHUcZUEaYuA==", "dev": true, "license": "MIT" }, @@ -5413,9 +5388,9 @@ "optional": true }, "node_modules/@types/turndown": { - "version": "5.0.5", - "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.5.tgz", - "integrity": "sha512-TL2IgGgc7B5j78rIccBtlYAnkuv8nUQqhQc+DSYV5j9Be9XOcm/SKOVRuA47xAVI3680Tk9B1d8flK2GWT2+4w==", + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.6.tgz", + "integrity": "sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==", "dev": true, "license": "MIT" }, @@ -6039,9 +6014,9 @@ } }, "node_modules/axios": { - "version": "1.11.0", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.11.0.tgz", - "integrity": "sha512-1Lx3WLFQWm3ooKDYZD1eXmoGO9fxYQjrycfHFC8P0sCfQVXyROp0p9PFWBehewBOdCwHc+f/b8I0fMto5eSfwA==", + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.13.1.tgz", + "integrity": "sha512-hU4EGxxt+j7TQijx1oYdAjw4xuIp1wRQSsbMFwSthCWeBQur1eF+qJ5iQ5sN3Tw8YRzQNKb8jszgBdMDVqwJcw==", "license": "MIT", "dependencies": { "follow-redirects": "^1.15.6", @@ -6106,9 +6081,9 @@ "license": "Apache-2.0" }, "node_modules/better-sqlite3": { - "version": "12.2.0", - "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-12.2.0.tgz", - "integrity": "sha512-eGbYq2CT+tos1fBwLQ/tkBt9J5M3JEHjku4hbvQUePCckkvVf14xWj+1m7dGoK81M/fOjFT7yM9UMeKT/+vFLQ==", + "version": "12.4.1", + "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-12.4.1.tgz", + "integrity": "sha512-3yVdyZhklTiNrtg+4WqHpJpFDd+WHTg2oM7UcR80GqL05AOV0xEJzc6qNvFYoEtE+hRp1n9MpN6/+4yhlGkDXQ==", "hasInstallScript": true, "license": "MIT", "dependencies": { @@ -6527,16 +6502,6 @@ "url": "https://github.com/sponsors/fb55" } }, - "node_modules/chownr": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz", - "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==", - "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": 
">=18" - } - }, "node_modules/clean-stack": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/clean-stack/-/clean-stack-2.2.0.tgz", @@ -6828,42 +6793,34 @@ } }, "node_modules/cli-truncate": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-4.0.0.tgz", - "integrity": "sha512-nPdaFdQ0h/GEigbPClz11D0v/ZJEwxmeVZGeMo3Z5StPtUTkA9o1lD6QwoirYiSDzbcwn2XcjwmCp68W1IS4TA==", + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-5.1.1.tgz", + "integrity": "sha512-SroPvNHxUnk+vIW/dOSfNqdy1sPEFkrTk6TUtqLCnBlo3N7TNYYkzzN7uSD6+jVjrdO4+p8nH7JzH6cIvUem6A==", "dev": true, "license": "MIT", "dependencies": { - "slice-ansi": "^5.0.0", - "string-width": "^7.0.0" + "slice-ansi": "^7.1.0", + "string-width": "^8.0.0" }, "engines": { - "node": ">=18" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/cli-truncate/node_modules/emoji-regex": { - "version": "10.4.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", - "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", - "dev": true, - "license": "MIT" - }, "node_modules/cli-truncate/node_modules/string-width": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", - "integrity": "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==", + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-8.1.0.tgz", + "integrity": "sha512-Kxl3KJGb/gxkaUMOjRsQ8IrXiGW75O4E3RPjFIINOVH8AMl2SQ/yWdTzWwF3FevIX9LcMAjJW+GRwAlAbTSXdg==", "dev": true, "license": "MIT", "dependencies": { - "emoji-regex": "^10.3.0", - "get-east-asian-width": "^1.0.0", + "get-east-asian-width": "^1.3.0", "strip-ansi": "^7.1.0" }, "engines": { - "node": ">=18" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -7013,9 +6970,9 @@ } }, "node_modules/commander": { - "version": "14.0.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.0.tgz", - "integrity": "sha512-2uM9rYjPvyq39NwLRqaiLtWHyDC1FvryJDa2ATTVims5YAS4PupsEQsDvP14FqhFr0P49CYDugi59xaxJlTXRA==", + "version": "14.0.2", + "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.2.tgz", + "integrity": "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ==", "license": "MIT", "engines": { "node": ">=20" @@ -7709,9 +7666,9 @@ } }, "node_modules/dompurify": { - "version": "3.2.6", - "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.2.6.tgz", - "integrity": "sha512-/2GogDQlohXPZe6D6NOgQvXLPSYBqIWMnZ8zzOhn09REE4eyAzb+Hed3jhoM9OkuaJ8P6ZGTTVWQKAi8ieIzfQ==", + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.3.0.tgz", + "integrity": "sha512-r+f6MYR1gGN1eJv0TVQbhA7if/U7P87cdPl3HN5rikqaBSBxLiCb/b9O+2eG0cxz0ghyU+mU1QkbsOwERMYlWQ==", "license": "(MPL-2.0 OR Apache-2.0)", "optionalDependencies": { "@types/trusted-types": "^2.0.7" @@ -7745,9 +7702,9 @@ } }, "node_modules/dotenv": { - "version": "17.2.1", - "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.2.1.tgz", - "integrity": "sha512-kQhDYKZecqnM0fCnzI5eIv5L4cAe/iRI+HqMbO/hbRdTAeXDG+M9FjipUxNfbARuEg4iHIbhnhs78BCHNbSxEQ==", + "version": "17.2.3", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.2.3.tgz", + "integrity": 
"sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==", "license": "BSD-2-Clause", "engines": { "node": ">=12" @@ -8475,23 +8432,6 @@ "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", "license": "MIT" }, - "node_modules/fast-glob": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz", - "integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@nodelib/fs.stat": "^2.0.2", - "@nodelib/fs.walk": "^1.2.3", - "glob-parent": "^5.1.2", - "merge2": "^1.3.0", - "micromatch": "^4.0.8" - }, - "engines": { - "node": ">=8.6.0" - } - }, "node_modules/fast-json-stable-stringify": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", @@ -8575,9 +8515,9 @@ } }, "node_modules/fastify": { - "version": "5.5.0", - "resolved": "https://registry.npmjs.org/fastify/-/fastify-5.5.0.tgz", - "integrity": "sha512-ZWSWlzj3K/DcULCnCjEiC2zn2FBPdlZsSA/pnPa/dbUfLvxkD/Nqmb0XXMXLrWkeM4uQPUvjdJpwtXmTfriXqw==", + "version": "5.6.1", + "resolved": "https://registry.npmjs.org/fastify/-/fastify-5.6.1.tgz", + "integrity": "sha512-WjjlOciBF0K8pDUPZoGPhqhKrQJ02I8DKaDIfO51EL0kbSMwQFl85cRwhOvmSDWoukNOdTo27gLN549pLCcH7Q==", "funding": [ { "type": "github", @@ -9104,9 +9044,9 @@ } }, "node_modules/generative-bayesian-network": { - "version": "2.1.70", - "resolved": "https://registry.npmjs.org/generative-bayesian-network/-/generative-bayesian-network-2.1.70.tgz", - "integrity": "sha512-nP0CNiVs/QS5ppMsGiEYN3dgAe3UTT1mpDth0wTh9uEyEO4e7y1Yr5PGDcTJsU0Lm3YM21yNzhuPbUg7etKHbQ==", + "version": "2.1.76", + "resolved": "https://registry.npmjs.org/generative-bayesian-network/-/generative-bayesian-network-2.1.76.tgz", + "integrity": "sha512-e9BByo5UEXPsrOii4RM94a02y1JXhP5XZKbzC5GWDz62Bbh2jWbrkY0ta2cF1rxrv8pqLu4c98yQC2F50Eqa7A==", "license": "Apache-2.0", "dependencies": { "adm-zip": "^0.5.9", @@ -9123,9 +9063,9 @@ } }, "node_modules/get-east-asian-width": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.3.0.tgz", - "integrity": "sha512-vpeMIQKxczTD/0s2CdEWHcb0eeJe6TFjxb+J5xgX7hScxqrGuyjmv4c1D4A/gelKfyox0gJJwIHF+fLjeaM8kQ==", + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.4.0.tgz", + "integrity": "sha512-QZjmEOC+IT1uk6Rx0sX22V6uHWVwbdbxf1faPqJ1QhLdGgsRGCZoyaQBm/piRdJy/D2um6hM1UP7ZEeQ4EkP+Q==", "dev": true, "license": "MIT", "engines": { @@ -9275,17 +9215,21 @@ "url": "https://github.com/sponsors/isaacs" } }, - "node_modules/glob-parent": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", - "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "node_modules/glob-to-regex.js": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/glob-to-regex.js/-/glob-to-regex.js-1.2.0.tgz", + "integrity": "sha512-QMwlOQKU/IzqMUOAZWubUOT8Qft+Y0KQWnX9nK3ch0CJg0tTp4TvGZsTfudYKv2NzoQSyPcnA6TYeIQ3jGichQ==", "dev": true, - "license": "ISC", - "dependencies": { - "is-glob": "^4.0.1" - }, + "license": "Apache-2.0", "engines": { - "node": ">= 6" + "node": ">=10.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/streamich" + }, + "peerDependencies": { + "tslib": 
"2" } }, "node_modules/global-directory": { @@ -9321,53 +9265,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/globby": { - "version": "14.1.0", - "resolved": "https://registry.npmjs.org/globby/-/globby-14.1.0.tgz", - "integrity": "sha512-0Ia46fDOaT7k4og1PDW4YbodWWr3scS2vAr2lTbsplOt2WkKp0vQbkI9wKis/T5LV/dqPjO3bpS/z6GTJB82LA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@sindresorhus/merge-streams": "^2.1.0", - "fast-glob": "^3.3.3", - "ignore": "^7.0.3", - "path-type": "^6.0.0", - "slash": "^5.1.0", - "unicorn-magic": "^0.3.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/globby/node_modules/path-type": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/path-type/-/path-type-6.0.0.tgz", - "integrity": "sha512-Vj7sf++t5pBD637NSfkxpHSMfWaeig5+DKWLhcqIYx6mWQz5hdJTGDVMQiJcw1ZYkhs7AazKDGpRVji1LJCZUQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/globby/node_modules/unicorn-magic": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/unicorn-magic/-/unicorn-magic-0.3.0.tgz", - "integrity": "sha512-+QBBXBCvifc56fsbuxZQ6Sic3wqqc3WWaqxs58gvJrcOuN83HGTCwz3oS5phzU9LthRNE9VrJCFCLUgHeeFnfA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/google-auth-library": { "version": "10.2.1", "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-10.2.1.tgz", @@ -9601,13 +9498,13 @@ } }, "node_modules/header-generator": { - "version": "2.1.70", - "resolved": "https://registry.npmjs.org/header-generator/-/header-generator-2.1.70.tgz", - "integrity": "sha512-s2/jN4hIr/pDRZhXA1D2T72eO4f8Gi1mwYEIFLbU+OR7cjo+Tayrw4RlTN3dXPahrU/MBdjk9gv//MwxLoCpGQ==", + "version": "2.1.76", + "resolved": "https://registry.npmjs.org/header-generator/-/header-generator-2.1.76.tgz", + "integrity": "sha512-Lqk4zU/MIHkm29Sfle6E3Jo2gUoscoG9x12jDt1RbH3kRq/RN+NRSoRRYggmkI0GQSS0wiOIfWwjgIRrA9nHqA==", "license": "Apache-2.0", "dependencies": { "browserslist": "^4.21.1", - "generative-bayesian-network": "^2.1.70", + "generative-bayesian-network": "^2.1.76", "ow": "^0.28.1", "tslib": "^2.4.0" }, @@ -9626,13 +9523,13 @@ } }, "node_modules/hook-std": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/hook-std/-/hook-std-3.0.0.tgz", - "integrity": "sha512-jHRQzjSDzMtFy34AGj1DN+vq54WVuhSvKgrHf0OMiFQTwDD4L/qqofVEWjLOBMTn5+lCD3fPg32W9yOfnEJTTw==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/hook-std/-/hook-std-4.0.0.tgz", + "integrity": "sha512-IHI4bEVOt3vRUDJ+bFA9VUJlo7SzvFARPNLw75pqSmAOP2HmTWfFJtPvLBrDrlgjEYXY9zs7SFdHPQaJShkSCQ==", "dev": true, "license": "MIT", "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -9829,16 +9726,6 @@ ], "license": "BSD-3-Clause" }, - "node_modules/ignore": { - "version": "7.0.5", - "resolved": "https://registry.npmjs.org/ignore/-/ignore-7.0.5.tgz", - "integrity": "sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 4" - } - }, "node_modules/import-fresh": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", @@ -10122,16 +10009,6 
@@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/is-finalizationregistry": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/is-finalizationregistry/-/is-finalizationregistry-1.1.1.tgz", @@ -10149,13 +10026,16 @@ } }, "node_modules/is-fullwidth-code-point": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-4.0.0.tgz", - "integrity": "sha512-O4L094N2/dZ7xqVdrXhh9r1KODPJpFms8B5sGdJLPy664AgvXsreZUyCQQNItZRDlYug4xStLjNp/sz3HvBowQ==", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-5.1.0.tgz", + "integrity": "sha512-5XHYaSyiqADb4RnZ1Bdad6cPp8Toise4TzEjcOYDHZkTCbKgiUl7WTUCpNWHuxmDt91wnsZBc9xinNzopv3JMQ==", "dev": true, "license": "MIT", + "dependencies": { + "get-east-asian-width": "^1.3.1" + }, "engines": { - "node": ">=12" + "node": ">=18" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -10180,19 +10060,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-glob": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", - "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", - "dev": true, - "license": "MIT", - "dependencies": { - "is-extglob": "^2.1.1" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/is-map": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz", @@ -10225,6 +10092,13 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-node-process": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/is-node-process/-/is-node-process-1.2.0.tgz", + "integrity": "sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw==", + "dev": true, + "license": "MIT" + }, "node_modules/is-number": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", @@ -10537,9 +10411,9 @@ } }, "node_modules/jiti": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.5.1.tgz", - "integrity": "sha512-twQoecYPiVA5K/h6SxtORw/Bs3ar+mLUtoPSc7iMXzQzK8d7eJ/R09wmTwAjiamETn1cXYPGfNnu7DMoHgu12w==", + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.6.1.tgz", + "integrity": "sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ==", "dev": true, "license": "MIT", "bin": { @@ -10554,9 +10428,9 @@ "license": "MIT" }, "node_modules/jose": { - "version": "6.0.12", - "resolved": "https://registry.npmjs.org/jose/-/jose-6.0.12.tgz", - "integrity": "sha512-T8xypXs8CpmiIi78k0E+Lk7T2zlK4zDyg+o1CZ4AkOHgDg98ogdP2BeZ61lTFKFyoEwJ9RgAgN+SdM3iPgNonQ==", + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.0.tgz", + "integrity": "sha512-TTQJyoEoKcC1lscpVDCSsVgYzUDg/0Bt3WE//WiTPK6uOCQC2KZS4MpugbMWt/zyjkopgZoXhZuCi00gLudfUA==", "license": "MIT", "funding": { "url": "https://github.com/sponsors/panva" @@ -10677,6 +10551,13 @@ "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", "license": "MIT" }, + 
"node_modules/json-stringify-safe": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", + "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==", + "dev": true, + "license": "ISC" + }, "node_modules/jsonfile": { "version": "6.2.0", "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.2.0.tgz", @@ -10990,9 +10871,9 @@ "license": "MIT" }, "node_modules/lightningcss": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss/-/lightningcss-1.30.1.tgz", - "integrity": "sha512-xi6IyHML+c9+Q3W0S4fCQJOym42pyurFiJUHEcEyHS0CeKzia4yZDEsLlqOFykxOdHpNy0NmvVO31vcSqAxJCg==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss/-/lightningcss-1.30.2.tgz", + "integrity": "sha512-utfs7Pr5uJyyvDETitgsaqSyjCb2qNRAtuqUeWIAKztsOYdcACf2KtARYXg2pSvhkt+9NfoaNY7fxjl6nuMjIQ==", "dev": true, "license": "MPL-2.0", "dependencies": { @@ -11006,22 +10887,44 @@ "url": "https://opencollective.com/parcel" }, "optionalDependencies": { - "lightningcss-darwin-arm64": "1.30.1", - "lightningcss-darwin-x64": "1.30.1", - "lightningcss-freebsd-x64": "1.30.1", - "lightningcss-linux-arm-gnueabihf": "1.30.1", - "lightningcss-linux-arm64-gnu": "1.30.1", - "lightningcss-linux-arm64-musl": "1.30.1", - "lightningcss-linux-x64-gnu": "1.30.1", - "lightningcss-linux-x64-musl": "1.30.1", - "lightningcss-win32-arm64-msvc": "1.30.1", - "lightningcss-win32-x64-msvc": "1.30.1" + "lightningcss-android-arm64": "1.30.2", + "lightningcss-darwin-arm64": "1.30.2", + "lightningcss-darwin-x64": "1.30.2", + "lightningcss-freebsd-x64": "1.30.2", + "lightningcss-linux-arm-gnueabihf": "1.30.2", + "lightningcss-linux-arm64-gnu": "1.30.2", + "lightningcss-linux-arm64-musl": "1.30.2", + "lightningcss-linux-x64-gnu": "1.30.2", + "lightningcss-linux-x64-musl": "1.30.2", + "lightningcss-win32-arm64-msvc": "1.30.2", + "lightningcss-win32-x64-msvc": "1.30.2" + } + }, + "node_modules/lightningcss-android-arm64": { + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-android-arm64/-/lightningcss-android-arm64-1.30.2.tgz", + "integrity": "sha512-BH9sEdOCahSgmkVhBLeU7Hc9DWeZ1Eb6wNS6Da8igvUwAe0sqROHddIlvU06q3WyXVEOYDZ6ykBZQnjTbmo4+A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MPL-2.0", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" } }, "node_modules/lightningcss-darwin-arm64": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-darwin-arm64/-/lightningcss-darwin-arm64-1.30.1.tgz", - "integrity": "sha512-c8JK7hyE65X1MHMN+Viq9n11RRC7hgin3HhYKhrMyaXflk5GVplZ60IxyoVtzILeKr+xAJwg6zK6sjTBJ0FKYQ==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-darwin-arm64/-/lightningcss-darwin-arm64-1.30.2.tgz", + "integrity": "sha512-ylTcDJBN3Hp21TdhRT5zBOIi73P6/W0qwvlFEk22fkdXchtNTOU4Qc37SkzV+EKYxLouZ6M4LG9NfZ1qkhhBWA==", "cpu": [ "arm64" ], @@ -11040,9 +10943,9 @@ } }, "node_modules/lightningcss-darwin-x64": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-darwin-x64/-/lightningcss-darwin-x64-1.30.1.tgz", - "integrity": "sha512-k1EvjakfumAQoTfcXUcHQZhSpLlkAuEkdMBsI/ivWw9hL+7FtilQc0Cy3hrx0AAQrVtQAbMI7YjCgYgvn37PzA==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-darwin-x64/-/lightningcss-darwin-x64-1.30.2.tgz", + 
"integrity": "sha512-oBZgKchomuDYxr7ilwLcyms6BCyLn0z8J0+ZZmfpjwg9fRVZIR5/GMXd7r9RH94iDhld3UmSjBM6nXWM2TfZTQ==", "cpu": [ "x64" ], @@ -11061,9 +10964,9 @@ } }, "node_modules/lightningcss-freebsd-x64": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-freebsd-x64/-/lightningcss-freebsd-x64-1.30.1.tgz", - "integrity": "sha512-kmW6UGCGg2PcyUE59K5r0kWfKPAVy4SltVeut+umLCFoJ53RdCUWxcRDzO1eTaxf/7Q2H7LTquFHPL5R+Gjyig==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-freebsd-x64/-/lightningcss-freebsd-x64-1.30.2.tgz", + "integrity": "sha512-c2bH6xTrf4BDpK8MoGG4Bd6zAMZDAXS569UxCAGcA7IKbHNMlhGQ89eRmvpIUGfKWNVdbhSbkQaWhEoMGmGslA==", "cpu": [ "x64" ], @@ -11082,9 +10985,9 @@ } }, "node_modules/lightningcss-linux-arm-gnueabihf": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-arm-gnueabihf/-/lightningcss-linux-arm-gnueabihf-1.30.1.tgz", - "integrity": "sha512-MjxUShl1v8pit+6D/zSPq9S9dQ2NPFSQwGvxBCYaBYLPlCWuPh9/t1MRS8iUaR8i+a6w7aps+B4N0S1TYP/R+Q==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-linux-arm-gnueabihf/-/lightningcss-linux-arm-gnueabihf-1.30.2.tgz", + "integrity": "sha512-eVdpxh4wYcm0PofJIZVuYuLiqBIakQ9uFZmipf6LF/HRj5Bgm0eb3qL/mr1smyXIS1twwOxNWndd8z0E374hiA==", "cpu": [ "arm" ], @@ -11103,9 +11006,9 @@ } }, "node_modules/lightningcss-linux-arm64-gnu": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-arm64-gnu/-/lightningcss-linux-arm64-gnu-1.30.1.tgz", - "integrity": "sha512-gB72maP8rmrKsnKYy8XUuXi/4OctJiuQjcuqWNlJQ6jZiWqtPvqFziskH3hnajfvKB27ynbVCucKSm2rkQp4Bw==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-linux-arm64-gnu/-/lightningcss-linux-arm64-gnu-1.30.2.tgz", + "integrity": "sha512-UK65WJAbwIJbiBFXpxrbTNArtfuznvxAJw4Q2ZGlU8kPeDIWEX1dg3rn2veBVUylA2Ezg89ktszWbaQnxD/e3A==", "cpu": [ "arm64" ], @@ -11124,9 +11027,9 @@ } }, "node_modules/lightningcss-linux-arm64-musl": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-arm64-musl/-/lightningcss-linux-arm64-musl-1.30.1.tgz", - "integrity": "sha512-jmUQVx4331m6LIX+0wUhBbmMX7TCfjF5FoOH6SD1CttzuYlGNVpA7QnrmLxrsub43ClTINfGSYyHe2HWeLl5CQ==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-linux-arm64-musl/-/lightningcss-linux-arm64-musl-1.30.2.tgz", + "integrity": "sha512-5Vh9dGeblpTxWHpOx8iauV02popZDsCYMPIgiuw97OJ5uaDsL86cnqSFs5LZkG3ghHoX5isLgWzMs+eD1YzrnA==", "cpu": [ "arm64" ], @@ -11145,9 +11048,9 @@ } }, "node_modules/lightningcss-linux-x64-gnu": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-x64-gnu/-/lightningcss-linux-x64-gnu-1.30.1.tgz", - "integrity": "sha512-piWx3z4wN8J8z3+O5kO74+yr6ze/dKmPnI7vLqfSqI8bccaTGY5xiSGVIJBDd5K5BHlvVLpUB3S2YCfelyJ1bw==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-linux-x64-gnu/-/lightningcss-linux-x64-gnu-1.30.2.tgz", + "integrity": "sha512-Cfd46gdmj1vQ+lR6VRTTadNHu6ALuw2pKR9lYq4FnhvgBc4zWY1EtZcAc6EffShbb1MFrIPfLDXD6Xprbnni4w==", "cpu": [ "x64" ], @@ -11166,9 +11069,9 @@ } }, "node_modules/lightningcss-linux-x64-musl": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-linux-x64-musl/-/lightningcss-linux-x64-musl-1.30.1.tgz", - "integrity": "sha512-rRomAK7eIkL+tHY0YPxbc5Dra2gXlI63HL+v1Pdi1a3sC+tJTcFrHX+E86sulgAXeI7rSzDYhPSeHHjqFhqfeQ==", + "version": "1.30.2", + "resolved": 
"https://registry.npmjs.org/lightningcss-linux-x64-musl/-/lightningcss-linux-x64-musl-1.30.2.tgz", + "integrity": "sha512-XJaLUUFXb6/QG2lGIW6aIk6jKdtjtcffUT0NKvIqhSBY3hh9Ch+1LCeH80dR9q9LBjG3ewbDjnumefsLsP6aiA==", "cpu": [ "x64" ], @@ -11187,9 +11090,9 @@ } }, "node_modules/lightningcss-win32-arm64-msvc": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-win32-arm64-msvc/-/lightningcss-win32-arm64-msvc-1.30.1.tgz", - "integrity": "sha512-mSL4rqPi4iXq5YVqzSsJgMVFENoa4nGTT/GjO2c0Yl9OuQfPsIfncvLrEW6RbbB24WtZ3xP/2CCmI3tNkNV4oA==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-win32-arm64-msvc/-/lightningcss-win32-arm64-msvc-1.30.2.tgz", + "integrity": "sha512-FZn+vaj7zLv//D/192WFFVA0RgHawIcHqLX9xuWiQt7P0PtdFEVaxgF9rjM/IRYHQXNnk61/H/gb2Ei+kUQ4xQ==", "cpu": [ "arm64" ], @@ -11208,9 +11111,9 @@ } }, "node_modules/lightningcss-win32-x64-msvc": { - "version": "1.30.1", - "resolved": "https://registry.npmjs.org/lightningcss-win32-x64-msvc/-/lightningcss-win32-x64-msvc-1.30.1.tgz", - "integrity": "sha512-PVqXh48wh4T53F/1CCu8PIPCxLzWyCnn/9T5W1Jpmdy5h9Cwd+0YQS6/LwhHXSafuc61/xg9Lv5OrCby6a++jg==", + "version": "1.30.2", + "resolved": "https://registry.npmjs.org/lightningcss-win32-x64-msvc/-/lightningcss-win32-x64-msvc-1.30.2.tgz", + "integrity": "sha512-5g1yc73p+iAkid5phb4oVFMB45417DkRevRbt/El/gKXJk4jid+vPFF/AXbxn05Aky8PapwzZrdJShv5C0avjw==", "cpu": [ "x64" ], @@ -11228,19 +11131,6 @@ "url": "https://opencollective.com/parcel" } }, - "node_modules/lilconfig": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", - "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/sponsors/antonk52" - } - }, "node_modules/lines-and-columns": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", @@ -11249,19 +11139,16 @@ "license": "MIT" }, "node_modules/lint-staged": { - "version": "16.1.5", - "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-16.1.5.tgz", - "integrity": "sha512-uAeQQwByI6dfV7wpt/gVqg+jAPaSp8WwOA8kKC/dv1qw14oGpnpAisY65ibGHUGDUv0rYaZ8CAJZ/1U8hUvC2A==", + "version": "16.2.6", + "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-16.2.6.tgz", + "integrity": "sha512-s1gphtDbV4bmW1eylXpVMk2u7is7YsrLl8hzrtvC70h4ByhcMLZFY01Fx05ZUDNuv1H8HO4E+e2zgejV1jVwNw==", "dev": true, "license": "MIT", "dependencies": { - "chalk": "^5.5.0", - "commander": "^14.0.0", - "debug": "^4.4.1", - "lilconfig": "^3.1.3", - "listr2": "^9.0.1", + "commander": "^14.0.1", + "listr2": "^9.0.5", "micromatch": "^4.0.8", - "nano-spawn": "^1.0.2", + "nano-spawn": "^2.0.0", "pidtree": "^0.6.0", "string-argv": "^0.3.2", "yaml": "^2.8.1" @@ -11277,13 +11164,13 @@ } }, "node_modules/listr2": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/listr2/-/listr2-9.0.1.tgz", - "integrity": "sha512-SL0JY3DaxylDuo/MecFeiC+7pedM0zia33zl0vcjgwcq1q1FWWF1To9EIauPbl8GbMCU0R2e0uJ8bZunhYKD2g==", + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/listr2/-/listr2-9.0.5.tgz", + "integrity": "sha512-ME4Fb83LgEgwNw96RKNvKV4VTLuXfoKudAmm2lP8Kk87KaMK0/Xrx/aAkMWmT8mDb+3MlFDspfbCs7adjRxA2g==", "dev": true, "license": "MIT", "dependencies": { - "cli-truncate": "^4.0.0", + "cli-truncate": "^5.0.0", "colorette": "^2.0.20", "eventemitter3": "^5.0.1", "log-update": "^6.1.0", @@ 
-11295,9 +11182,9 @@ } }, "node_modules/listr2/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", + "integrity": "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg==", "dev": true, "license": "MIT", "engines": { @@ -11308,9 +11195,9 @@ } }, "node_modules/listr2/node_modules/emoji-regex": { - "version": "10.4.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", - "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.6.0.tgz", + "integrity": "sha512-toUI84YS5YmxW219erniWD0CIVOo46xGKColeNQRgOzDorgBi1v4D71/OFzgD9GO2UGKIv1C3Sp8DAn0+j5w7A==", "dev": true, "license": "MIT" }, @@ -11333,9 +11220,9 @@ } }, "node_modules/listr2/node_modules/wrap-ansi": { - "version": "9.0.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.0.tgz", - "integrity": "sha512-G8ura3S+3Z2G+mkgNRq8dqaFZAuxfsxpBB8OCTGRTCtp+l/v9nbFNmCUP1BZMts3G1142MsZfn6eeUKrr4PD1Q==", + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.2.tgz", + "integrity": "sha512-42AtmgqjV+X1VpdOfyTGOYRi0/zsoLqtXQckTmqTeybT+BDIbM/Guxo7x3pE2vtpr1ok6xRqM9OpBe+Jyoqyww==", "dev": true, "license": "MIT", "dependencies": { @@ -11554,9 +11441,9 @@ } }, "node_modules/log-update/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", + "integrity": "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg==", "dev": true, "license": "MIT", "engines": { @@ -11567,45 +11454,12 @@ } }, "node_modules/log-update/node_modules/emoji-regex": { - "version": "10.4.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", - "integrity": "sha512-EC+0oUMY1Rqm4O6LLrgjtYDvcVYTy7chDnM4Q7030tP4Kwj3u/pR6gP9ygnp2CJMK5Gq+9Q2oqmrFJAz01DXjw==", + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.6.0.tgz", + "integrity": "sha512-toUI84YS5YmxW219erniWD0CIVOo46xGKColeNQRgOzDorgBi1v4D71/OFzgD9GO2UGKIv1C3Sp8DAn0+j5w7A==", "dev": true, "license": "MIT" }, - "node_modules/log-update/node_modules/is-fullwidth-code-point": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-5.0.0.tgz", - "integrity": "sha512-OVa3u9kkBbw7b8Xw5F9P+D/T9X+Z4+JruYVNapTjPYZYUznQ5YfWeFkOj606XYYW8yugTfC8Pj0hYqvi4ryAhA==", - "dev": true, - "license": "MIT", - "dependencies": { - "get-east-asian-width": "^1.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/log-update/node_modules/slice-ansi": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-7.1.0.tgz", - "integrity": "sha512-bSiSngZ/jWeX93BqeIAbImyTbEihizcwNjFoRUIY/T1wWQsfsm2Vw1agPKylXvQTU7iASGdHhyqRlqQzfz+Htg==", - 
"dev": true, - "license": "MIT", - "dependencies": { - "ansi-styles": "^6.2.1", - "is-fullwidth-code-point": "^5.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/chalk/slice-ansi?sponsor=1" - } - }, "node_modules/log-update/node_modules/string-width": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", @@ -11625,9 +11479,9 @@ } }, "node_modules/log-update/node_modules/wrap-ansi": { - "version": "9.0.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.0.tgz", - "integrity": "sha512-G8ura3S+3Z2G+mkgNRq8dqaFZAuxfsxpBB8OCTGRTCtp+l/v9nbFNmCUP1BZMts3G1142MsZfn6eeUKrr4PD1Q==", + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.2.tgz", + "integrity": "sha512-42AtmgqjV+X1VpdOfyTGOYRi0/zsoLqtXQckTmqTeybT+BDIbM/Guxo7x3pE2vtpr1ok6xRqM9OpBe+Jyoqyww==", "dev": true, "license": "MIT", "dependencies": { @@ -11666,13 +11520,13 @@ "license": "ISC" }, "node_modules/magic-string": { - "version": "0.30.17", - "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.17.tgz", - "integrity": "sha512-sNPKHvyjVf7gyjwS4xGTaW/mCnF8wnjtifKBEhxfZ7E/S8tQ0rssrwGNn6q8JH/ohItJfSQp9mBtQYuTlH5QnA==", + "version": "0.30.21", + "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz", + "integrity": "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==", "dev": true, "license": "MIT", "dependencies": { - "@jridgewell/sourcemap-codec": "^1.5.0" + "@jridgewell/sourcemap-codec": "^1.5.5" } }, "node_modules/markdown-table": { @@ -11949,21 +11803,19 @@ } }, "node_modules/memfs": { - "version": "4.36.3", - "resolved": "https://registry.npmjs.org/memfs/-/memfs-4.36.3.tgz", - "integrity": "sha512-rZIVsNPGdZDPls/ckWhIsod2zRNsI2f2kEru0gMldkrEve+fPn7CVBTvfKLNyHQ9rZDWwzVBF8tPsZivzDPiZQ==", + "version": "4.50.0", + "resolved": "https://registry.npmjs.org/memfs/-/memfs-4.50.0.tgz", + "integrity": "sha512-N0LUYQMUA1yS5tJKmMtU9yprPm6ZIg24yr/OVv/7t6q0kKDIho4cBbXRi1XKttUmNYDYgF/q45qrKE/UhGO0CA==", "dev": true, "license": "Apache-2.0", "dependencies": { "@jsonjoy.com/json-pack": "^1.11.0", "@jsonjoy.com/util": "^1.9.0", + "glob-to-regex.js": "^1.0.1", "thingies": "^2.5.0", "tree-dump": "^1.0.3", "tslib": "^2.0.0" }, - "engines": { - "node": ">= 4.0.0" - }, "funding": { "type": "github", "url": "https://github.com/sponsors/streamich" @@ -12010,16 +11862,6 @@ "dev": true, "license": "MIT" }, - "node_modules/merge2": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", - "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 8" - } - }, "node_modules/micromark": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.2.tgz", @@ -12611,9 +12453,9 @@ } }, "node_modules/mime": { - "version": "4.0.7", - "resolved": "https://registry.npmjs.org/mime/-/mime-4.0.7.tgz", - "integrity": "sha512-2OfDPL+e03E0LrXaGYOtTFIYhiuzep94NSsuhrNULq+stylcJedcHdzHtz0atMUuGwJfFYs0YL5xeC/Ca2x0eQ==", + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-4.1.0.tgz", + "integrity": "sha512-X5ju04+cAzsojXKes0B/S4tcYtFAJ6tTMuSPBEn9CPGlrWr8Fiw7qYeLT0XyH80HSoAoqWCaz+MWKh22P7G1cw==", "funding": [ "https://github.com/sponsors/broofa" ], @@ -12723,35 +12565,6 @@ "node": ">=16 || 14 >=14.17" } }, - "node_modules/minizlib": { - "version": 
"3.0.2", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-3.0.2.tgz", - "integrity": "sha512-oG62iEk+CYt5Xj2YqI5Xi9xWUeZhDI8jjQmC5oThVH5JGCTgIjr7ciJDzC7MBzYd//WvR1OTmP5Q38Q8ShQtVA==", - "dev": true, - "license": "MIT", - "dependencies": { - "minipass": "^7.1.2" - }, - "engines": { - "node": ">= 18" - } - }, - "node_modules/mkdirp": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-3.0.1.tgz", - "integrity": "sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==", - "dev": true, - "license": "MIT", - "bin": { - "mkdirp": "dist/cjs/src/bin.js" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, "node_modules/mkdirp-classic": { "version": "0.5.3", "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", @@ -12826,9 +12639,9 @@ } }, "node_modules/nano-spawn": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/nano-spawn/-/nano-spawn-1.0.2.tgz", - "integrity": "sha512-21t+ozMQDAL/UGgQVBbZ/xXvNO10++ZPuTmKRO8k9V3AClVRht49ahtDjfY8l1q6nSHOrE5ASfthzH3ol6R/hg==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/nano-spawn/-/nano-spawn-2.0.0.tgz", + "integrity": "sha512-tacvGzUY5o2D8CBh2rrwxyNojUsZNU2zjNTzKQrkgGJQTbGAfArVWXSKMBokBeeg6C7OLRGUEyoFlYbfeWQIqw==", "dev": true, "license": "MIT", "engines": { @@ -12892,6 +12705,21 @@ "dev": true, "license": "MIT" }, + "node_modules/nock": { + "version": "14.0.10", + "resolved": "https://registry.npmjs.org/nock/-/nock-14.0.10.tgz", + "integrity": "sha512-Q7HjkpyPeLa0ZVZC5qpxBt5EyLczFJ91MEewQiIi9taWuA0KB/MDJlUWtON+7dGouVdADTQsf9RA7TZk6D8VMw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@mswjs/interceptors": "^0.39.5", + "json-stringify-safe": "^5.0.1", + "propagate": "^2.0.0" + }, + "engines": { + "node": ">=18.20.0 <20 || >=20.12.1" + } + }, "node_modules/node-abi": { "version": "3.75.0", "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.75.0.tgz", @@ -16102,6 +15930,13 @@ "integrity": "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==", "license": "MIT" }, + "node_modules/outvariant": { + "version": "1.4.3", + "resolved": "https://registry.npmjs.org/outvariant/-/outvariant-1.4.3.tgz", + "integrity": "sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA==", + "dev": true, + "license": "MIT" + }, "node_modules/ow": { "version": "0.28.2", "resolved": "https://registry.npmjs.org/ow/-/ow-0.28.2.tgz", @@ -16768,12 +16603,12 @@ "license": "MIT" }, "node_modules/posthog-node": { - "version": "5.7.0", - "resolved": "https://registry.npmjs.org/posthog-node/-/posthog-node-5.7.0.tgz", - "integrity": "sha512-6J1AIZWtbr2lEbZOO2AzO/h1FPJjUZM4KWcdaL2UQw7FY8J7VNaH3NiaRockASFmglpID7zEY25gV/YwCtuXjg==", + "version": "5.11.0", + "resolved": "https://registry.npmjs.org/posthog-node/-/posthog-node-5.11.0.tgz", + "integrity": "sha512-9+gmWp/7AEryJMi0+/ywJjKQhpkmcjxf+eT030fTIIPvFTF84zeeagdZBGNC/Nh2Jc0grIAW6O1n5lxXiX3daA==", "license": "MIT", "dependencies": { - "@posthog/core": "1.0.0" + "@posthog/core": "1.5.0" }, "engines": { "node": ">=20" @@ -16844,6 +16679,16 @@ ], "license": "MIT" }, + "node_modules/propagate": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/propagate/-/propagate-2.0.1.tgz", + "integrity": "sha512-vGrhOavPSTz4QVNuBNdcNXePNdNMaO1xj9yBeH1ScQPjk/rhg9sSlCXPhMkFuaNNW/syTvYqsnbIJxMBfRbbag==", + "dev": true, + "license": "MIT", + 
"engines": { + "node": ">= 8" + } + }, "node_modules/property-information": { "version": "7.1.0", "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz", @@ -16952,27 +16797,6 @@ ], "license": "MIT" }, - "node_modules/queue-microtask": { - "version": "1.2.3", - "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", - "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" - }, "node_modules/quick-format-unescaped": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/quick-format-unescaped/-/quick-format-unescaped-4.0.4.tgz", @@ -17441,30 +17265,6 @@ "integrity": "sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw==", "license": "MIT" }, - "node_modules/run-parallel": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", - "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT", - "dependencies": { - "queue-microtask": "^1.2.2" - } - }, "node_modules/safe-array-concat": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.3.tgz", @@ -17617,9 +17417,9 @@ "license": "BSD-3-Clause" }, "node_modules/semantic-release": { - "version": "24.2.7", - "resolved": "https://registry.npmjs.org/semantic-release/-/semantic-release-24.2.7.tgz", - "integrity": "sha512-g7RssbTAbir1k/S7uSwSVZFfFXwpomUB9Oas0+xi9KStSCmeDXcA7rNhiskjLqvUe/Evhx8fVCT16OSa34eM5g==", + "version": "24.2.9", + "resolved": "https://registry.npmjs.org/semantic-release/-/semantic-release-24.2.9.tgz", + "integrity": "sha512-phCkJ6pjDi9ANdhuF5ElS10GGdAKY6R1Pvt9lT3SFhOwM4T7QZE7MLpBDbNruUx/Q3gFD92/UOFringGipRqZA==", "dev": true, "license": "MIT", "dependencies": { @@ -17637,7 +17437,7 @@ "find-versions": "^6.0.0", "get-stream": "^6.0.0", "git-log-parser": "^1.2.0", - "hook-std": "^3.0.0", + "hook-std": "^4.0.0", "hosted-git-info": "^8.0.0", "import-from-esm": "^2.0.0", "lodash-es": "^4.17.21", @@ -17649,7 +17449,7 @@ "read-package-up": "^11.0.0", "resolve-from": "^5.0.0", "semver": "^7.3.2", - "semver-diff": "^4.0.0", + "semver-diff": "^5.0.0", "signale": "^1.2.1", "yargs": "^17.5.1" }, @@ -17892,9 +17692,9 @@ } }, "node_modules/semver": { - "version": "7.7.2", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", - "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", + "version": "7.7.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", + "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", "license": "ISC", "bin": { "semver": "bin/semver.js" @@ -17904,9 +17704,10 @@ } }, "node_modules/semver-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/semver-diff/-/semver-diff-4.0.0.tgz", - "integrity": 
"sha512-0Ju4+6A8iOnpL/Thra7dZsSlOHYAHIeMxfhWQRI1/VLcT3WDBZKKtQt/QkBOsiIN9ZpuvHE6cGZ0x4glCMmfiA==", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/semver-diff/-/semver-diff-5.0.0.tgz", + "integrity": "sha512-0HbGtOm+S7T6NGQ/pxJSJipJvc4DK3FcRVMRkhsIwJDJ4Jcz5DQC1cPPzB5GhzyHjwttW878HaWQq46CkL3cqg==", + "deprecated": "Deprecated as the semver package now supports this built-in.", "dev": true, "license": "MIT", "dependencies": { @@ -18320,40 +18121,27 @@ "node": ">=8" } }, - "node_modules/slash": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/slash/-/slash-5.1.0.tgz", - "integrity": "sha512-ZA6oR3T/pEyuqwMgAKT0/hAv8oAXckzbkmR0UkUosQ+Mc4RxGoJkRmwHgHufaenlyAgE1Mxgpdcrf75y6XcnDg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/slice-ansi": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-5.0.0.tgz", - "integrity": "sha512-FC+lgizVPfie0kkhqUScwRu1O/lF6NOgJmlCgK+/LYxDCTk8sGelYaHDhFcDN+Sn3Cv+3VSa4Byeo+IMCzpMgQ==", + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-7.1.2.tgz", + "integrity": "sha512-iOBWFgUX7caIZiuutICxVgX1SdxwAVFFKwt1EvMYYec/NWO5meOJ6K5uQxhrYBdQJne4KxiqZc+KptFOWFSI9w==", "dev": true, "license": "MIT", "dependencies": { - "ansi-styles": "^6.0.0", - "is-fullwidth-code-point": "^4.0.0" + "ansi-styles": "^6.2.1", + "is-fullwidth-code-point": "^5.0.0" }, "engines": { - "node": ">=12" + "node": ">=18" }, "funding": { "url": "https://github.com/chalk/slice-ansi?sponsor=1" } }, "node_modules/slice-ansi/node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", + "integrity": "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg==", "dev": true, "license": "MIT", "engines": { @@ -18586,6 +18374,13 @@ "readable-stream": "^2.0.2" } }, + "node_modules/strict-event-emitter": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/strict-event-emitter/-/strict-event-emitter-0.5.1.tgz", + "integrity": "sha512-vMgjE/GGEPEFnhFub6pa4FmJBRBVOLpIII2hvCZ8Kzb7K0hlHo7mQv6xYrBvCL2LtAIBwFUK8wvuJgTVSQ5MFQ==", + "dev": true, + "license": "MIT" + }, "node_modules/string_decoder": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", @@ -18934,38 +18729,24 @@ "license": "MIT" }, "node_modules/tailwindcss": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.12.tgz", - "integrity": "sha512-DzFtxOi+7NsFf7DBtI3BJsynR+0Yp6etH+nRPTbpWnS2pZBaSksv/JGctNwSWzbFjp0vxSqknaUylseZqMDGrA==", + "version": "4.1.16", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.16.tgz", + "integrity": "sha512-pONL5awpaQX4LN5eiv7moSiSPd/DLDzKVRJz8Q9PgzmAdd1R4307GQS2ZpfiN7ZmekdQrfhZZiSE5jkLR4WNaA==", "dev": true, "license": "MIT" }, "node_modules/tapable": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.2.2.tgz", - "integrity": "sha512-Re10+NauLTMCudc7T5WLFLAwDhQ0JWdrMK+9B2M8zR5hRExKmsRDCBA7/aV/pNJFltmBFO5BAMlQFi/vq3nKOg==", + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.3.0.tgz", + 
"integrity": "sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==", "dev": true, "license": "MIT", "engines": { "node": ">=6" - } - }, - "node_modules/tar": { - "version": "7.4.3", - "resolved": "https://registry.npmjs.org/tar/-/tar-7.4.3.tgz", - "integrity": "sha512-5S7Va8hKfV7W5U6g3aYxXmlPoZVAwUMy9AOKyF2fVuZa2UD3qZjg578OrLRt8PcNN1PleVaL/5/yYATNL0ICUw==", - "dev": true, - "license": "ISC", - "dependencies": { - "@isaacs/fs-minipass": "^4.0.0", - "chownr": "^3.0.0", - "minipass": "^7.1.2", - "minizlib": "^3.0.1", - "mkdirp": "^3.0.1", - "yallist": "^5.0.0" }, - "engines": { - "node": ">=18" + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" } }, "node_modules/tar-fs": { @@ -19445,9 +19226,9 @@ } }, "node_modules/turndown": { - "version": "7.2.1", - "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.1.tgz", - "integrity": "sha512-7YiPJw6rLClQL3oUKN3KgMaXeJJ2lAyZItclgKDurqnH61so4k4IH/qwmMva0zpuJc/FhRExBBnk7EbeFANlgQ==", + "version": "7.2.2", + "resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.2.tgz", + "integrity": "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==", "license": "MIT", "dependencies": { "@mixmark-io/domino": "^2.2.0" @@ -19559,9 +19340,9 @@ } }, "node_modules/typescript": { - "version": "5.9.2", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.2.tgz", - "integrity": "sha512-CWBzXQrc/qOkhidw1OzBTQuYRbfyxDXJMVJ1XNwUHGROVmuaeiEm3OslpZ1RV96d7SKKjZKrSJu3+t/xlw3R9A==", + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "license": "Apache-2.0", "bin": { "tsc": "bin/tsc", @@ -20503,16 +20284,6 @@ "node": ">=10" } }, - "node_modules/yallist": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", - "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==", - "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=18" - } - }, "node_modules/yaml": { "version": "2.8.1", "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.8.1.tgz", @@ -20629,9 +20400,9 @@ } }, "node_modules/zod": { - "version": "4.0.17", - "resolved": "https://registry.npmjs.org/zod/-/zod-4.0.17.tgz", - "integrity": "sha512-1PHjlYRevNxxdy2JZ8JcNAw7rX8V9P1AKkP+x/xZfxB0K5FYfuV+Ug6P/6NVSR2jHQ+FzDDoDHS04nYUsOIyLQ==", + "version": "4.1.12", + "resolved": "https://registry.npmjs.org/zod/-/zod-4.1.12.tgz", + "integrity": "sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ==", "license": "MIT", "funding": { "url": "https://github.com/sponsors/colinhacks" diff --git a/package.json b/package.json index 6610d537..8c3fb58d 100644 --- a/package.json +++ b/package.json @@ -36,12 +36,13 @@ "test:e2e:watch": "vitest --config test/vitest.config.ts", "lint": "biome check .", "lint:fix": "biome check . --fix", + "typecheck": "npx tsc --noEmit --project tsconfig.test.json", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. 
See README.md for details.'" }, "dependencies": { "@fastify/formbody": "^8.0.2", - "@fastify/static": "^8.2.0", + "@fastify/static": "^8.3.0", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", @@ -49,54 +50,54 @@ "@langchain/google-genai": "^0.2.16", "@langchain/google-vertexai": "^0.2.16", "@langchain/openai": "^0.6.3", - "@modelcontextprotocol/sdk": "^1.17.1", - "@trpc/client": "^11.4.4", + "@modelcontextprotocol/sdk": "^1.20.2", + "@trpc/client": "^11.7.1", "@trpc/server": "^11.4.4", "alpinejs": "^3.14.9", - "axios": "^1.11.0", + "axios": "^1.13.1", "axios-retry": "^4.5.0", - "better-sqlite3": "^12.2.0", + "better-sqlite3": "^12.4.1", "cheerio": "^1.1.2", "commander": "^14.0.0", - "dompurify": "^3.2.6", - "dotenv": "^17.2.1", + "dompurify": "^3.3.0", + "dotenv": "^17.2.3", "env-paths": "^3.0.0", - "fastify": "^5.4.0", + "fastify": "^5.6.1", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", - "header-generator": "^2.1.69", + "header-generator": "^2.1.76", "htmx.org": "^2.0.6", "iconv-lite": "^0.6.3", - "jose": "^6.0.12", + "jose": "^6.1.0", "jsdom": "^26.1.0", "langchain": "^0.3.30", - "mime": "^4.0.7", + "mime": "^4.1.0", "minimatch": "^10.0.1", "playwright": "^1.52.0", - "posthog-node": "^5.7.0", + "posthog-node": "^5.11.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", - "semver": "^7.7.2", + "semver": "^7.7.3", "sqlite-vec": "^0.1.7-alpha.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.23.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.23.2", - "turndown": "^7.2.0", - "zod": "^4.0.14" + "turndown": "^7.2.2", + "zod": "^4.1.12" }, "devDependencies": { - "@biomejs/biome": "^2.1.3", + "@biomejs/biome": "^2.3.2", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", - "@semantic-release/github": "^11.0.3", + "@semantic-release/github": "^11.0.6", "@semantic-release/npm": "^12.0.2", - "@tailwindcss/postcss": "^4.1.11", - "@tailwindcss/vite": "^4.1.11", + "@tailwindcss/postcss": "^4.1.16", + "@tailwindcss/vite": "^4.1.16", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", @@ -104,18 +105,19 @@ "@types/node": "^24.1.0", "@types/node-fetch": "^2.6.13", "@types/psl": "^1.1.3", - "@types/semver": "^7.7.0", - "@types/turndown": "^5.0.5", + "@types/semver": "^7.7.1", + "@types/turndown": "^5.0.6", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", - "lint-staged": "^16.1.2", - "memfs": "^4.34.0", + "lint-staged": "^16.2.6", + "memfs": "^4.50.0", + "nock": "^14.0.10", "npm-run-all": "^4.1.5", "postcss": "^8.5.6", - "semantic-release": "^24.2.7", + "semantic-release": "^24.2.9", "tailwindcss": "^4.1.4", - "typescript": "^5.9.2", + "typescript": "^5.9.3", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", diff --git a/src/cli/commands/refresh.ts b/src/cli/commands/refresh.ts new file mode 100644 index 00000000..e8ad34a4 --- /dev/null +++ b/src/cli/commands/refresh.ts @@ -0,0 +1,110 @@ +/** + * Refresh command - Re-scrapes an existing library version using ETags to skip unchanged pages. 
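The new `src/cli/commands/refresh.ts` above describes re-scraping with ETags so unchanged pages can be skipped. As a rough illustration of that conditional-request pattern (not the project's scraper code; the `fetchIfChanged` helper, the `CachedPage` shape, and its field names are invented for this sketch), a refresh boils down to sending `If-None-Match` and treating a `304` reply as "unchanged":

```typescript
// Illustrative only: ETag-based conditional fetch. A 304 means the cached page
// (and its existing chunks/embeddings) can be kept as-is; anything else is new content.
interface CachedPage {
  url: string;
  etag?: string;
}

async function fetchIfChanged(
  page: CachedPage,
): Promise<{ changed: boolean; body?: string; etag?: string }> {
  const headers: Record<string, string> = {};
  if (page.etag) headers["If-None-Match"] = page.etag;

  const res = await fetch(page.url, { headers });
  if (res.status === 304) {
    return { changed: false }; // unchanged since the last scrape
  }
  return {
    changed: true,
    body: await res.text(),
    etag: res.headers.get("etag") ?? undefined,
  };
}
```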
+ */ + +import type { Command } from "commander"; +import { Option } from "commander"; +import type { PipelineOptions } from "../../pipeline"; +import type { IPipeline } from "../../pipeline/trpc/interfaces"; +import { createDocumentManagement } from "../../store"; +import type { IDocumentManagement } from "../../store/trpc/interfaces"; +import { analytics, TelemetryEvent } from "../../telemetry"; +import { RefreshVersionTool } from "../../tools/RefreshVersionTool"; +import { + createPipelineWithCallbacks, + getGlobalOptions, + resolveEmbeddingContext, +} from "../utils"; + +export async function refreshAction( + library: string, + options: { + version?: string; + embeddingModel?: string; + serverUrl?: string; + }, + command?: Command, +) { + await analytics.track(TelemetryEvent.CLI_COMMAND, { + command: "refresh", + library, + version: options.version, + useServerUrl: !!options.serverUrl, + }); + + const serverUrl = options.serverUrl; + const globalOptions = getGlobalOptions(command); + + // Resolve embedding configuration for local execution (refresh needs embeddings) + const embeddingConfig = resolveEmbeddingContext(options.embeddingModel); + if (!serverUrl && !embeddingConfig) { + throw new Error( + "Embedding configuration is required for local refresh operations. " + + "Please set DOCS_MCP_EMBEDDING_MODEL environment variable or use --server-url for remote execution.", + ); + } + + const docService: IDocumentManagement = await createDocumentManagement({ + serverUrl, + embeddingConfig, + storePath: globalOptions.storePath, + }); + let pipeline: IPipeline | null = null; + + try { + const pipelineOptions: PipelineOptions = { + recoverJobs: false, + concurrency: 1, + serverUrl, + }; + + pipeline = await createPipelineWithCallbacks( + serverUrl ? undefined : (docService as unknown as never), + pipelineOptions, + ); + await pipeline.start(); + const refreshTool = new RefreshVersionTool(pipeline); + + // Call the tool directly - tracking is now handled inside the tool + const result = await refreshTool.execute({ + library, + version: options.version, + waitForCompletion: true, // Always wait for completion in CLI + }); + + if ("pagesRefreshed" in result) { + console.log(`✅ Successfully refreshed ${result.pagesRefreshed} pages`); + } else { + console.log(`🚀 Refresh job started with ID: ${result.jobId}`); + } + } finally { + if (pipeline) await pipeline.stop(); + await docService.shutdown(); + } +} + +export function createRefreshCommand(program: Command): Command { + return program + .command("refresh ") + .description( + "Re-scrape an existing library version, updating only changed pages.\n\n" + + "Uses HTTP ETags to efficiently skip unchanged pages and only re-process\n" + + "content that has been modified or deleted since the last scrape.\n\n" + + "Examples:\n" + + " refresh react --version 18.0.0\n" + + " refresh mylib\n" + + "\nNote: The library and version must already be indexed. 
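The handler above tells a completed refresh apart from a queued one with an `in` check on the tool result. A minimal sketch of the presumed union (the type names are assumptions; `RefreshVersionTool`'s actual declarations are not part of this diff):

```typescript
// Assumed result shape of RefreshVersionTool.execute(), inferred from the
// `"pagesRefreshed" in result` branch in the CLI handler above.
type RefreshCompleted = { pagesRefreshed: number }; // waitForCompletion: true
type RefreshEnqueued = { jobId: string }; // waitForCompletion: false
type RefreshResult = RefreshCompleted | RefreshEnqueued;

function describeRefreshResult(result: RefreshResult): string {
  // The `in` operator narrows the union, mirroring both the CLI and MCP handlers.
  return "pagesRefreshed" in result
    ? `refreshed ${result.pagesRefreshed} pages`
    : `job ${result.jobId} enqueued`;
}
```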
Use 'scrape' to index a new library/version.", + ) + .option("-v, --version ", "Version of the library (optional)") + .addOption( + new Option( + "--embedding-model ", + "Embedding model configuration (e.g., 'openai:text-embedding-3-small')", + ).env("DOCS_MCP_EMBEDDING_MODEL"), + ) + .option( + "--server-url ", + "URL of external pipeline worker RPC (e.g., http://localhost:6280/api)", + ) + .action(refreshAction); +} diff --git a/src/cli/index.test.ts b/src/cli/index.test.ts index bfa0f97a..c375bddf 100644 --- a/src/cli/index.test.ts +++ b/src/cli/index.test.ts @@ -238,6 +238,7 @@ describe("CLI Command Arguments Matrix", () => { "web", "worker", "scrape", + "refresh", "search", "list", "find-version", diff --git a/src/cli/index.ts b/src/cli/index.ts index 90c9158c..8acfea81 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -16,6 +16,7 @@ import { createFetchUrlCommand } from "./commands/fetchUrl"; import { createFindVersionCommand } from "./commands/findVersion"; import { createListCommand } from "./commands/list"; import { createMcpCommand } from "./commands/mcp"; +import { createRefreshCommand } from "./commands/refresh"; import { createRemoveCommand } from "./commands/remove"; import { createScrapeCommand } from "./commands/scrape"; import { createSearchCommand } from "./commands/search"; @@ -132,6 +133,7 @@ export function createCliProgram(): Command { createWebCommand(program); createWorkerCommand(program); createScrapeCommand(program); + createRefreshCommand(program); createSearchCommand(program); createListCommand(program); createFindVersionCommand(program); diff --git a/src/cli/utils.ts b/src/cli/utils.ts index 0fd819bc..1c3a8316 100644 --- a/src/cli/utils.ts +++ b/src/cli/utils.ts @@ -173,7 +173,7 @@ export async function createPipelineWithCallbacks( ): Promise { logger.debug(`Initializing pipeline with options: ${JSON.stringify(options)}`); const { serverUrl, ...rest } = options; - const pipeline = serverUrl + const pipeline: IPipeline = serverUrl ? await PipelineFactory.createPipeline(undefined, { serverUrl, ...rest }) : await (async () => { if (!docService) { @@ -194,7 +194,7 @@ export async function createPipelineWithCallbacks( }, onJobError: async (job, error, document) => { logger.warn( - `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`, + `⚠️ Job ${job.id} error ${document ? 
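`createPipelineWithCallbacks` is now explicitly typed as `IPipeline`, and `PipelineClient` further down gains a second enqueue method. For orientation, a rough sketch of the interface surface implied by this diff (the real declaration lives in `src/pipeline/trpc/interfaces` and may differ):

```typescript
// Sketch of the pipeline surface exercised in this change set. The ScraperOptions
// alias is a stand-in for the real options type, which is not shown here.
type ScraperOptions = Record<string, unknown>;

interface IPipelineSketch {
  start(): Promise<void>;
  stop(): Promise<void>;
  enqueueScrapeJob(
    library: string,
    version: string | undefined | null,
    options: ScraperOptions,
  ): Promise<string>; // resolves to the new job's id
  enqueueRefreshJob(library: string, version: string | undefined | null): Promise<string>;
}
```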
`on document ${document.url}` : ""}: ${error.message}`, ); }, }); diff --git a/src/mcp/mcpServer.test.ts b/src/mcp/mcpServer.test.ts index fdc16113..c9bbd877 100644 --- a/src/mcp/mcpServer.test.ts +++ b/src/mcp/mcpServer.test.ts @@ -24,6 +24,9 @@ const mockTools: McpServerTools = { scrape: { execute: vi.fn(async () => ({ jobId: "job-123" })), } as any, + refresh: { + execute: vi.fn(async () => ({ jobId: "refresh-job-123" })), + } as any, listJobs: { execute: vi.fn(async () => ({ jobs: [] })), } as any, diff --git a/src/mcp/mcpServer.ts b/src/mcp/mcpServer.ts index 8510078c..b6fff20f 100644 --- a/src/mcp/mcpServer.ts +++ b/src/mcp/mcpServer.ts @@ -115,6 +115,56 @@ export function createMcpServerInstance( } }, ); + + // Refresh version tool - suppress deep inference issues + server.tool( + "refresh_version", + "Re-scrape a previously indexed library version, updating only changed pages.", + { + library: z.string().trim().describe("Library name."), + version: z + .string() + .trim() + .optional() + .describe("Library version (optional, refreshes unversioned if omitted)."), + }, + { + title: "Refresh Library Version", + destructiveHint: false, // Only updates changed content + openWorldHint: true, // requires internet access + }, + async ({ library, version }) => { + // Track MCP tool usage + analytics.track(TelemetryEvent.TOOL_USED, { + tool: "refresh_version", + context: "mcp_server", + library, + version, + }); + + try { + // Execute refresh tool without waiting + const result = await tools.refresh.execute({ + library, + version, + waitForCompletion: false, // Don't wait for completion + }); + + // Check the type of result + if ("jobId" in result) { + // If we got a jobId back, report that + return createResponse(`🔄 Refresh job started with ID: ${result.jobId}.`); + } + // This case shouldn't happen if waitForCompletion is false, but handle defensively + return createResponse( + `Refresh finished immediately (unexpectedly) with ${result.pagesRefreshed} pages.`, + ); + } catch (error) { + // Handle errors during job enqueueing or initial setup + return createError(error); + } + }, + ); } // Search docs tool diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 04a51bc7..5057b8fe 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -8,6 +8,7 @@ import { GetJobInfoTool, ListJobsTool, ListLibrariesTool, + RefreshVersionTool, RemoveTool, ScrapeTool, SearchTool, @@ -20,6 +21,7 @@ export interface McpServerTools { listLibraries: ListLibrariesTool; findVersion: FindVersionTool; scrape: ScrapeTool; + refresh: RefreshVersionTool; search: SearchTool; listJobs: ListJobsTool; getJobInfo: GetJobInfoTool; @@ -43,6 +45,7 @@ export async function initializeTools( listLibraries: new ListLibrariesTool(docService), findVersion: new FindVersionTool(docService), scrape: new ScrapeTool(pipeline), + refresh: new RefreshVersionTool(pipeline), search: new SearchTool(docService), listJobs: new ListJobsTool(pipeline), getJobInfo: new GetJobInfoTool(pipeline), diff --git a/src/pipeline/PipelineClient.test.ts b/src/pipeline/PipelineClient.test.ts index 13c88f9b..668ffda2 100644 --- a/src/pipeline/PipelineClient.test.ts +++ b/src/pipeline/PipelineClient.test.ts @@ -6,7 +6,8 @@ vi.mock("../utils/logger"); // Mock tRPC client factory const mockClient: any = { ping: { query: vi.fn() }, - enqueueJob: { mutate: vi.fn() }, + enqueueScrapeJob: { mutate: vi.fn() }, + enqueueRefreshJob: { mutate: vi.fn() }, getJob: { query: vi.fn() }, getJobs: { query: vi.fn() }, cancelJob: { mutate: vi.fn() }, @@ -28,7 +29,8 @@ 
describe("PipelineClient", () => { vi.resetAllMocks(); // Reset default mock behaviors mockClient.ping.query.mockResolvedValue({ status: "ok" }); - mockClient.enqueueJob.mutate.mockResolvedValue({ jobId: "job-123" }); + mockClient.enqueueScrapeJob.mutate.mockResolvedValue({ jobId: "job-123" }); + mockClient.enqueueRefreshJob.mutate.mockResolvedValue({ jobId: "job-456" }); mockClient.getJob.query.mockResolvedValue(undefined); mockClient.getJobs.query.mockResolvedValue({ jobs: [] }); mockClient.cancelJob.mutate.mockResolvedValue({ success: true }); @@ -50,18 +52,18 @@ describe("PipelineClient", () => { }); }); - describe("enqueueJob", () => { + describe("enqueueScrapeJob", () => { it("should delegate job creation to external API", async () => { const mockJobId = "job-123"; - mockClient.enqueueJob.mutate.mockResolvedValueOnce({ jobId: mockJobId }); - const jobId = await client.enqueueJob("react", "18.0.0", { + mockClient.enqueueScrapeJob.mutate.mockResolvedValueOnce({ jobId: mockJobId }); + const jobId = await client.enqueueScrapeJob("react", "18.0.0", { url: "https://react.dev", library: "react", version: "18.0.0", }); expect(jobId).toBe(mockJobId); - expect(mockClient.enqueueJob.mutate).toHaveBeenCalledWith({ + expect(mockClient.enqueueScrapeJob.mutate).toHaveBeenCalledWith({ library: "react", version: "18.0.0", options: { @@ -73,9 +75,9 @@ describe("PipelineClient", () => { }); it("should handle API errors gracefully", async () => { - mockClient.enqueueJob.mutate.mockRejectedValueOnce(new Error("Bad request")); + mockClient.enqueueScrapeJob.mutate.mockRejectedValueOnce(new Error("Bad request")); - await expect(client.enqueueJob("invalid", null, {} as any)).rejects.toThrow( + await expect(client.enqueueScrapeJob("invalid", null, {} as any)).rejects.toThrow( "Failed to enqueue job: Bad request", ); }); diff --git a/src/pipeline/PipelineClient.ts b/src/pipeline/PipelineClient.ts index 85532d9d..c680c189 100644 --- a/src/pipeline/PipelineClient.ts +++ b/src/pipeline/PipelineClient.ts @@ -68,7 +68,7 @@ export class PipelineClient implements IPipeline { logger.debug("PipelineClient stopped"); } - async enqueueJob( + async enqueueScrapeJob( library: string, version: string | undefined | null, options: ScraperOptions, @@ -78,7 +78,7 @@ export class PipelineClient implements IPipeline { typeof version === "string" && version.trim().length === 0 ? null : (version ?? null); - const result = await this.client.enqueueJob.mutate({ + const result = await this.client.enqueueScrapeJob.mutate({ library, version: normalizedVersion, options, @@ -92,6 +92,28 @@ export class PipelineClient implements IPipeline { } } + async enqueueRefreshJob( + library: string, + version: string | undefined | null, + ): Promise { + try { + const normalizedVersion = + typeof version === "string" && version.trim().length === 0 + ? null + : (version ?? null); + const result = await this.client.enqueueRefreshJob.mutate({ + library, + version: normalizedVersion, + }); + logger.debug(`Refresh job ${result.jobId} enqueued successfully`); + return result.jobId; + } catch (error) { + throw new Error( + `Failed to enqueue refresh job: ${error instanceof Error ? 
error.message : String(error)}`, + ); + } + } + async getJob(jobId: string): Promise { try { const serializedJob = await this.client.getJob.query({ id: jobId }); diff --git a/src/pipeline/PipelineFactory.ts b/src/pipeline/PipelineFactory.ts index e2e9aac9..19959b4b 100644 --- a/src/pipeline/PipelineFactory.ts +++ b/src/pipeline/PipelineFactory.ts @@ -24,8 +24,7 @@ export namespace PipelineFactory { // Overload: Remote pipeline client (out-of-process worker) export async function createPipeline( docService: undefined, - options: Required> & - Omit, + options: PipelineOptions & { serverUrl: string }, ): Promise; // Implementation export async function createPipeline( diff --git a/src/pipeline/PipelineManager.test.ts b/src/pipeline/PipelineManager.test.ts index 37fa75bb..25849a9c 100644 --- a/src/pipeline/PipelineManager.test.ts +++ b/src/pipeline/PipelineManager.test.ts @@ -15,7 +15,7 @@ vi.mock("uuid", () => { }); import { afterEach, beforeEach, describe, expect, it, type Mock, vi } from "vitest"; -import type { ScraperProgress } from "../scraper/types"; +import type { ScraperProgressEvent } from "../scraper/types"; import type { DocumentManagementService } from "../store/DocumentManagementService"; import { ListJobsTool } from "../tools/ListJobsTool"; import { PipelineManager } from "./PipelineManager"; @@ -67,7 +67,11 @@ describe("PipelineManager", () => { progress: null, error: null, sourceUrl: "https://example.com", - scraperOptions: null, + scraperOptions: { + url: "https://example.com", + library: "test-lib", + version: "1.0.0", + }, abortController: new AbortController(), completionPromise: Promise.resolve(), resolveCompletion: () => {}, @@ -79,13 +83,22 @@ describe("PipelineManager", () => { const createTestProgress = ( pagesScraped: number, totalPages: number, - ): ScraperProgress => ({ + ): ScraperProgressEvent => ({ pagesScraped, totalPages, currentUrl: `https://example.com/page-${pagesScraped}`, depth: 1, maxDepth: 3, totalDiscovered: 0, + result: { + url: `https://example.com/page-${pagesScraped}`, + title: `Page ${pagesScraped}`, + contentType: "text/html", + textContent: "", + links: [], + errors: [], + chunks: [], + }, }); beforeEach(() => { @@ -98,6 +111,10 @@ describe("PipelineManager", () => { updateVersionStatus: vi.fn().mockResolvedValue(undefined), updateVersionProgress: vi.fn().mockResolvedValue(undefined), // For progress tests getVersionsByStatus: vi.fn().mockResolvedValue([]), + // Refresh job methods + ensureVersion: vi.fn().mockResolvedValue(1), + getPagesByVersionId: vi.fn().mockResolvedValue([]), + getScraperOptions: vi.fn().mockResolvedValue(null), }; // Mock the worker's executeJob method @@ -128,7 +145,7 @@ describe("PipelineManager", () => { // --- Enqueueing Tests --- it("should enqueue a job with QUEUED status and return a job ID", async () => { const options = { url: "http://a.com", library: "libA", version: "1.0" }; - const jobId = await manager.enqueueJob("libA", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libA", "1.0", options); const job = await manager.getJob(jobId); expect(job?.status).toBe(PipelineJobStatus.QUEUED); expect(job?.library).toBe("libA"); @@ -149,7 +166,7 @@ describe("PipelineManager", () => { maxPages: 1, maxDepth: 1, }; - const jobId = await manager.enqueueJob("libA", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libA", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); const job = await manager.getJob(jobId); @@ -160,7 +177,7 @@ describe("PipelineManager", () => { 
it("should complete a job and transition to COMPLETED", async () => { const options = { url: "http://a.com", library: "libA", version: "1.0" }; - const jobId = await manager.enqueueJob("libA", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libA", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); await manager.waitForJobCompletion(jobId); @@ -189,7 +206,7 @@ describe("PipelineManager", () => { }), ); } - const jobId1 = await manager.enqueueJob( + const jobId1 = await manager.enqueueScrapeJob( "libA", desc === "unversioned" ? undefined : "1.0", options1, @@ -204,7 +221,7 @@ describe("PipelineManager", () => { library: "libA", version: desc === "unversioned" ? "" : "1.0", }; - const jobId2 = await manager.enqueueJob( + const jobId2 = await manager.enqueueScrapeJob( "libA", desc === "unversioned" ? undefined : "1.0", options2, @@ -228,7 +245,7 @@ describe("PipelineManager", () => { it("should transition job to FAILED if worker throws", async () => { mockWorkerInstance.executeJob.mockRejectedValue(new Error("fail")); const options = { url: "http://fail.com", library: "libFail", version: "1.0" }; - const jobId = await manager.enqueueJob("libFail", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libFail", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); await manager.waitForJobCompletion(jobId).catch(() => {}); // Handle expected rejection @@ -245,7 +262,7 @@ describe("PipelineManager", () => { }), ); const options = { url: "http://cancel.com", library: "libCancel", version: "1.0" }; - const jobId = await manager.enqueueJob("libCancel", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libCancel", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); await manager.cancelJob(jobId); @@ -272,7 +289,7 @@ describe("PipelineManager", () => { library: "libProgress", version: "1.0", }; - const jobId = await manager.enqueueJob("libProgress", "1.0", options); + const jobId = await manager.enqueueScrapeJob("libProgress", "1.0", options); await manager.start(); await vi.advanceTimersByTimeAsync(1); await manager.waitForJobCompletion(jobId); @@ -286,8 +303,8 @@ describe("PipelineManager", () => { const optionsB = { url: "http://b.com", library: "libB", version: "1.0" }; const pendingPromise = new Promise(() => {}); mockWorkerInstance.executeJob.mockReturnValue(pendingPromise); - const jobIdA = await manager.enqueueJob("libA", "1.0", optionsA); - const jobIdB = await manager.enqueueJob("libB", "1.0", optionsB); + const jobIdA = await manager.enqueueScrapeJob("libA", "1.0", optionsA); + const jobIdB = await manager.enqueueScrapeJob("libB", "1.0", optionsB); await manager.start(); await vi.advanceTimersByTimeAsync(1); const jobA = await manager.getJob(jobIdA); @@ -398,7 +415,7 @@ describe("PipelineManager", () => { describe("Database Status Integration", () => { it("should update database status when job is enqueued", async () => { const options = { url: "http://example.com", library: "test-lib", version: "1.0" }; - await manager.enqueueJob("test-lib", "1.0", options); + await manager.enqueueScrapeJob("test-lib", "1.0", options); // Should ensure library/version exists and update status to QUEUED expect(mockStore.ensureLibraryAndVersion).toHaveBeenCalledWith("test-lib", "1.0"); @@ -407,7 +424,7 @@ describe("PipelineManager", () => { it("should handle unversioned jobs correctly", async () => { const options = { url: "http://example.com", library: "test-lib", version: "" }; - await 
manager.enqueueJob("test-lib", null, options); + await manager.enqueueScrapeJob("test-lib", null, options); // Should treat null version as empty string expect(mockStore.ensureLibraryAndVersion).toHaveBeenCalledWith("test-lib", ""); @@ -476,7 +493,7 @@ describe("PipelineManager", () => { it("should map job statuses to database statuses correctly", async () => { // Test that the mapping function works correctly by checking enum values const options = { url: "http://example.com", library: "test-lib", version: "1.0" }; - const jobId = await manager.enqueueJob("test-lib", "1.0", options); + const jobId = await manager.enqueueScrapeJob("test-lib", "1.0", options); // Verify the job was created with correct status const job = await manager.getJob(jobId); @@ -495,7 +512,9 @@ describe("PipelineManager", () => { const options = { url: "http://example.com", library: "test-lib", version: "1.0" }; // Should not throw even if database update fails - await expect(manager.enqueueJob("test-lib", "1.0", options)).resolves.toBeDefined(); + await expect( + manager.enqueueScrapeJob("test-lib", "1.0", options), + ).resolves.toBeDefined(); // Job should still be created in memory despite database error const allJobs = await manager.getJobs(); @@ -549,7 +568,7 @@ describe("PipelineManager", () => { // This should not cause the system to hang try { - const jobId = await manager.enqueueJob("test-lib", "1.0", options); + const jobId = await manager.enqueueScrapeJob("test-lib", "1.0", options); // If it succeeds, verify the job exists if (jobId) { const job = await manager.getJob(jobId); @@ -582,4 +601,134 @@ describe("PipelineManager", () => { expect(cleanupSpy).toHaveBeenCalledTimes(1); }); }); + + // --- Refresh Job Tests --- + describe("enqueueRefreshJob", () => { + it("should successfully enqueue a refresh job with initial queue", async () => { + // Setup: Mock pages and scraper options for an existing version + const mockPages = [ + { id: 1, url: "https://example.com/page1", depth: 0, etag: "etag1" }, + { id: 2, url: "https://example.com/page2", depth: 1, etag: "etag2" }, + { id: 3, url: "https://example.com/page3", depth: 1, etag: "etag3" }, + ]; + + (mockStore.ensureVersion as Mock).mockResolvedValue(456); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue(mockPages); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: { maxDepth: 2 }, + }); + + // Action: Enqueue a refresh job + const jobId = await manager.enqueueRefreshJob("test-lib", "1.0.0"); + + // Assertions: Verify the job was created with correct properties + expect(jobId).toBeDefined(); + expect(typeof jobId).toBe("string"); + + const job = await manager.getJob(jobId); + expect(job).toBeDefined(); + expect(job?.status).toBe(PipelineJobStatus.QUEUED); + expect(job?.library).toBe("test-lib"); + expect(job?.version).toBe("1.0.0"); + + // Verify the scraper options contain an initialQueue with the same number of pages + // Note: initialQueue is part of ScraperOptions but not VersionScraperOptions (storage type) + expect(job?.scraperOptions).toBeDefined(); + const scraperOpts = job?.scraperOptions as any; + expect(scraperOpts?.initialQueue).toBeDefined(); + expect(scraperOpts?.initialQueue).toHaveLength(mockPages.length); + + // Verify maxPages is NOT set (allowing discovery of new pages during refresh) + expect(scraperOpts?.maxPages).toBeUndefined(); + }); + + it("should handle unversioned libraries during refresh", async () => { + const mockPages = [ + { id: 1, url: 
"https://example.com/page1", depth: 0, etag: "etag1" }, + ]; + + (mockStore.ensureVersion as Mock).mockResolvedValue(789); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue(mockPages); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: {}, + }); + + // Action: Enqueue refresh for unversioned library (null/undefined version) + const jobId = await manager.enqueueRefreshJob("unversioned-lib", null); + + // Assertions + const job = await manager.getJob(jobId); + expect(job).toBeDefined(); + expect(job?.library).toBe("unversioned-lib"); + expect(job?.version).toBe(null); // Public API uses null for unversioned + const scraperOpts = job?.scraperOptions as any; + expect(scraperOpts?.initialQueue).toHaveLength(1); + }); + + it("should throw error when refreshing a version with no pages", async () => { + // Setup: Mock empty pages array + (mockStore.ensureVersion as Mock).mockResolvedValue(999); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue([]); + + // Action & Assertion: Should throw with clear error message + await expect(manager.enqueueRefreshJob("empty-lib", "1.0.0")).rejects.toThrow( + "No pages found for empty-lib@1.0.0", + ); + }); + + it("should throw error when refreshing unversioned library with no pages", async () => { + // Setup: Mock empty pages array for unversioned library + (mockStore.ensureVersion as Mock).mockResolvedValue(888); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue([]); + + // Action & Assertion: Should throw with clear error message including "unversioned" + await expect(manager.enqueueRefreshJob("empty-lib", undefined)).rejects.toThrow( + "No pages found for empty-lib@unversioned", + ); + }); + + it("should preserve page depth and etag in initialQueue", async () => { + const mockPages = [ + { id: 10, url: "https://example.com/deep", depth: 5, etag: "deep-etag" }, + { id: 11, url: "https://example.com/shallow", depth: 0, etag: null }, + ]; + + (mockStore.ensureVersion as Mock).mockResolvedValue(111); + (mockStore.getPagesByVersionId as Mock).mockResolvedValue(mockPages); + (mockStore.getScraperOptions as Mock).mockResolvedValue({ + sourceUrl: "https://example.com", + options: {}, + }); + + const jobId = await manager.enqueueRefreshJob("depth-test", "1.0.0"); + const job = await manager.getJob(jobId); + + // Verify initialQueue contains depth and etag information + // Note: initialQueue is part of ScraperOptions but not VersionScraperOptions (storage type) + const scraperOpts = job?.scraperOptions as any; + const queue = scraperOpts?.initialQueue; + expect(queue).toBeDefined(); + expect(queue).toHaveLength(2); + + // Verify deep page + const deepItem = queue?.find( + (item: any) => item.url === "https://example.com/deep", + ); + expect(deepItem).toBeDefined(); + expect(deepItem?.depth).toBe(5); + expect(deepItem?.etag).toBe("deep-etag"); + expect(deepItem?.pageId).toBe(10); + + // Verify shallow page + const shallowItem = queue?.find( + (item: any) => item.url === "https://example.com/shallow", + ); + expect(shallowItem).toBeDefined(); + expect(shallowItem?.depth).toBe(0); + expect(shallowItem?.etag).toBe(null); + expect(shallowItem?.pageId).toBe(11); + }); + }); }); diff --git a/src/pipeline/PipelineManager.ts b/src/pipeline/PipelineManager.ts index 213261bb..8dc9a7fd 100644 --- a/src/pipeline/PipelineManager.ts +++ b/src/pipeline/PipelineManager.ts @@ -9,7 +9,7 @@ import { v4 as uuidv4 } from "uuid"; import { ScraperRegistry, ScraperService } from "../scraper"; -import type { 
ScraperOptions, ScraperProgress } from "../scraper/types"; +import type { ScraperOptions, ScraperProgressEvent } from "../scraper/types"; import type { DocumentManagementService } from "../store"; import { VersionStatus } from "../store/types"; import { DEFAULT_MAX_CONCURRENCY } from "../utils/config"; @@ -235,7 +235,7 @@ export class PipelineManager implements IPipeline { /** * Enqueues a new document processing job, aborting any existing QUEUED/RUNNING job for the same library+version (including unversioned). */ - async enqueueJob( + async enqueueScrapeJob( library: string, version: string | undefined | null, options: ScraperOptions, @@ -243,15 +243,6 @@ export class PipelineManager implements IPipeline { // Normalize version: treat undefined/null as "" (unversioned) const normalizedVersion = version ?? ""; - // Extract URL and convert ScraperOptions to VersionScraperOptions - const { - url, - library: _library, - version: _version, - signal: _signal, - ...versionOptions - } = options; - // Abort any existing QUEUED or RUNNING job for the same library+version const allJobs = await this.getJobs(); const duplicateJobs = allJobs.filter( @@ -299,8 +290,8 @@ export class PipelineManager implements IPipeline { progressMaxPages: 0, errorMessage: null, updatedAt: new Date(), - sourceUrl: url, - scraperOptions: versionOptions, + sourceUrl: options.url, + scraperOptions: options, }; this.jobMap.set(jobId, job); @@ -322,6 +313,77 @@ export class PipelineManager implements IPipeline { return jobId; } + /** + * Enqueues a refresh job for an existing library version by re-scraping all pages + * and using ETag comparison to skip unchanged content. + */ + async enqueueRefreshJob( + library: string, + version: string | undefined | null, + ): Promise { + // Normalize version: treat undefined/null as "" (unversioned) + const normalizedVersion = version ?? ""; + + try { + // First, check if the library version exists + const versionId = await this.store.ensureVersion({ + library, + version: normalizedVersion, + }); + + // Get all pages for this version with their ETags and depths + const pages = await this.store.getPagesByVersionId(versionId); + + // Debug: Log first page to see what data we're getting + if (pages.length > 0) { + logger.debug( + `Sample page data: url=${pages[0].url}, etag=${pages[0].etag}, depth=${pages[0].depth}`, + ); + } + + if (pages.length === 0) { + throw new Error( + `No pages found for ${library}@${normalizedVersion || "unversioned"}. Use scrape_docs to index it first.`, + ); + } + + logger.info( + `🔄 Preparing refresh job for ${library}@${normalizedVersion || "unversioned"} with ${pages.length} page(s)`, + ); + + // Build initialQueue from pages with original depth values + const initialQueue = pages.map((page) => ({ + url: page.url, + depth: page.depth ?? 
0, // Use original depth, fallback to 0 for old data + pageId: page.id, + etag: page.etag, + })); + + // Get stored scraper options to retrieve the source URL and other options + const storedOptions = await this.store.getScraperOptions(versionId); + + // Build scraper options with initialQueue and isRefresh flag + const scraperOptions = { + url: storedOptions?.sourceUrl || pages[0].url, // Required but not used when initialQueue is set + library, + version: normalizedVersion, + ...(storedOptions?.options || {}), // Include stored options if available (spread first) + // Override with refresh-specific options (these must come after the spread) + initialQueue, // Pre-populated queue with existing pages + isRefresh: true, // Mark this as a refresh operation + }; + + // Enqueue as a standard scrape job with the initialQueue + logger.info( + `📝 Enqueueing refresh job for ${library}@${normalizedVersion || "unversioned"}`, + ); + return this.enqueueScrapeJob(library, normalizedVersion, scraperOptions); + } catch (error) { + logger.error(`❌ Failed to enqueue refresh job: ${error}`); + throw error; + } + } + /** * Enqueues a job using stored scraper options from a previous indexing run. * If no stored options are found, throws an error. @@ -360,7 +422,7 @@ export class PipelineManager implements IPipeline { `🔄 Re-indexing ${library}@${normalizedVersion || "unversioned"} with stored options from ${stored.sourceUrl}`, ); - return this.enqueueJob(library, normalizedVersion, completeOptions); + return this.enqueueScrapeJob(library, normalizedVersion, completeOptions); } catch (error) { logger.error(`❌ Failed to enqueue job with stored options: ${error}`); throw error; @@ -649,14 +711,8 @@ export class PipelineManager implements IPipeline { // Store scraper options when job is first queued if (newStatus === PipelineJobStatus.QUEUED && job.scraperOptions) { try { - // Reconstruct ScraperOptions for storage (DocumentStore will filter runtime fields) - const fullOptions = { - url: job.sourceUrl ?? 
"", - library: job.library, - version: job.version, - ...job.scraperOptions, - }; - await this.store.storeScraperOptions(versionId, fullOptions); + // Pass the complete scraper options (DocumentStore will filter runtime fields) + await this.store.storeScraperOptions(versionId, job.scraperOptions); logger.debug( `Stored scraper options for ${job.library}@${job.version}: ${job.sourceUrl}`, ); @@ -681,7 +737,7 @@ export class PipelineManager implements IPipeline { */ async updateJobProgress( job: InternalPipelineJob, - progress: ScraperProgress, + progress: ScraperProgressEvent, ): Promise { // Update in-memory progress job.progress = progress; diff --git a/src/pipeline/PipelineWorker.test.ts b/src/pipeline/PipelineWorker.test.ts index fa94fe6d..d1b05ac1 100644 --- a/src/pipeline/PipelineWorker.test.ts +++ b/src/pipeline/PipelineWorker.test.ts @@ -1,8 +1,7 @@ import { beforeEach, describe, expect, it, type Mock, vi } from "vitest"; import type { ScraperService } from "../scraper"; -import type { ScraperProgress } from "../scraper/types"; +import type { ScrapeResult, ScraperProgressEvent } from "../scraper/types"; import type { DocumentManagementService } from "../store/DocumentManagementService"; -import type { Document } from "../types"; import { PipelineWorker } from "./PipelineWorker"; import type { InternalPipelineJob, PipelineManagerCallbacks } from "./types"; import { PipelineJobStatus } from "./types"; @@ -24,8 +23,9 @@ describe("PipelineWorker", () => { vi.resetAllMocks(); mockStore = { - addDocument: vi.fn().mockResolvedValue(undefined), + addScrapeResult: vi.fn().mockResolvedValue(undefined), removeAllDocuments: vi.fn().mockResolvedValue(undefined), + deletePage: vi.fn().mockResolvedValue(undefined), }; mockScraperService = { @@ -65,53 +65,56 @@ describe("PipelineWorker", () => { rejectCompletion: vi.fn(), sourceUrl: "http://example.com", scraperOptions: { + url: "http://example.com", + library: "test-lib", + version: "1.0.0", maxPages: 10, maxDepth: 1, }, }; }); - it("should execute job successfully, calling scrape, addDocument, and onJobProgress", async () => { - const mockDoc1: Document = { - content: "doc1", - metadata: { - url: "url1", - title: "Doc 1", - library: mockJob.library, // Add required field - version: mockJob.version, // Add required field - }, + it("should execute job successfully, calling scrape, addScrapeResult, and onJobProgress", async () => { + const mockProcessed1: ScrapeResult = { + textContent: "doc1", + url: "url1", + title: "Doc 1", + contentType: "text/html", + chunks: [], + links: [], + errors: [], }; - const mockDoc2: Document = { - content: "doc2", - metadata: { - url: "url2", - title: "Doc 2", - library: mockJob.library, // Add required field - version: mockJob.version, // Add required field - }, + const mockProcessed2: ScrapeResult = { + textContent: "doc2", + url: "url2", + title: "Doc 2", + contentType: "text/html", + chunks: [], + links: [], + errors: [], }; // Configure mock scrape to yield progress (mockScraperService.scrape as Mock).mockImplementation( async (_options, progressCallback, _signal) => { - const progress1: ScraperProgress = { + const progress1: ScraperProgressEvent = { pagesScraped: 1, totalPages: 2, currentUrl: "url1", depth: 1, maxDepth: 1, - document: mockDoc1, + result: mockProcessed1, totalDiscovered: 0, }; await progressCallback(progress1); - const progress2: ScraperProgress = { + const progress2: ScraperProgressEvent = { pagesScraped: 2, totalPages: 2, currentUrl: "url2", depth: 1, maxDepth: 1, - document: mockDoc2, + 
result: mockProcessed2, totalDiscovered: 0, }; await progressCallback(progress2); @@ -127,39 +130,38 @@ describe("PipelineWorker", () => { mockJob.version, ); - // Verify scrape was called + // Verify scrape was called with the complete scraper options expect(mockScraperService.scrape).toHaveBeenCalledOnce(); expect(mockScraperService.scrape).toHaveBeenCalledWith( - { - url: mockJob.sourceUrl, - library: mockJob.library, - version: mockJob.version, - ...mockJob.scraperOptions, - }, + mockJob.scraperOptions, // Now passes the complete options directly expect.any(Function), // The progress callback abortController.signal, ); - // Verify addDocument was called for each document - expect(mockStore.addDocument).toHaveBeenCalledTimes(2); - expect(mockStore.addDocument).toHaveBeenCalledWith(mockJob.library, mockJob.version, { - pageContent: mockDoc1.content, - metadata: mockDoc1.metadata, - }); - expect(mockStore.addDocument).toHaveBeenCalledWith(mockJob.library, mockJob.version, { - pageContent: mockDoc2.content, - metadata: mockDoc2.metadata, - }); + // Verify addScrapeResult was called for each document + expect(mockStore.addScrapeResult).toHaveBeenCalledTimes(2); + expect(mockStore.addScrapeResult).toHaveBeenCalledWith( + mockJob.library, + mockJob.version, + 1, + mockProcessed1, + ); + expect(mockStore.addScrapeResult).toHaveBeenCalledWith( + mockJob.library, + mockJob.version, + 1, + mockProcessed2, + ); // Verify onJobProgress was called expect(mockCallbacks.onJobProgress).toHaveBeenCalledTimes(2); expect(mockCallbacks.onJobProgress).toHaveBeenCalledWith( mockJob, - expect.objectContaining({ document: mockDoc1 }), + expect.objectContaining({ result: mockProcessed1 }), ); expect(mockCallbacks.onJobProgress).toHaveBeenCalledWith( mockJob, - expect.objectContaining({ document: mockDoc2 }), + expect.objectContaining({ result: mockProcessed2 }), ); // Verify job progress object was NOT updated directly by worker @@ -178,67 +180,81 @@ describe("PipelineWorker", () => { // Verify dependencies were called appropriately expect(mockScraperService.scrape).toHaveBeenCalledOnce(); - expect(mockStore.addDocument).not.toHaveBeenCalled(); + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); expect(mockCallbacks.onJobProgress).not.toHaveBeenCalled(); expect(mockCallbacks.onJobError).not.toHaveBeenCalled(); }); - it("should call onJobError and continue if store.addDocument fails", async () => { - const mockDoc: Document = { - content: "doc1", - metadata: { url: "url1", title: "Doc 1", library: "test-lib", version: "1.0.0" }, + it("should call onJobError and continue if store.addScrapeResult fails", async () => { + const mockProcessed: ScrapeResult = { + textContent: "doc1", + url: "url1", + title: "Doc 1", + contentType: "text/html", + chunks: [], + links: [], + errors: [], }; const storeError = new Error("Database error"); // Simulate scrape yielding one document (mockScraperService.scrape as Mock).mockImplementation( async (_options, progressCallback, _signal) => { - const progress: ScraperProgress = { + const progress: ScraperProgressEvent = { pagesScraped: 1, totalPages: 1, currentUrl: "url1", depth: 1, maxDepth: 1, - document: mockDoc, + result: mockProcessed, totalDiscovered: 0, }; await progressCallback(progress); }, ); - // Simulate addDocument failing - (mockStore.addDocument as Mock).mockRejectedValue(storeError); + // Simulate addScrapeResult failing + (mockStore.addScrapeResult as Mock).mockRejectedValue(storeError); // Execute the job - should complete despite the error await 
expect(worker.executeJob(mockJob, mockCallbacks)).resolves.toBeUndefined(); // Verify scrape was called expect(mockScraperService.scrape).toHaveBeenCalledOnce(); - // Verify addDocument was called - expect(mockStore.addDocument).toHaveBeenCalledOnce(); + // Verify addScrapeResult was called + expect(mockStore.addScrapeResult).toHaveBeenCalledOnce(); // Verify onJobProgress was called expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); - // Verify onJobError was called + // Verify onJobError was called with the page that failed expect(mockCallbacks.onJobError).toHaveBeenCalledOnce(); - expect(mockCallbacks.onJobError).toHaveBeenCalledWith(mockJob, storeError, mockDoc); + expect(mockCallbacks.onJobError).toHaveBeenCalledWith( + mockJob, + storeError, + mockProcessed, + ); }); it("should throw CancellationError if cancelled during scrape progress", async () => { - const mockDoc: Document = { - content: "doc1", - metadata: { url: "url1", title: "Doc 1", library: "test-lib", version: "1.0.0" }, + const mockProcessed: ScrapeResult = { + textContent: "doc1", + url: "url1", + title: "Doc 1", + contentType: "text/html", + chunks: [], + links: [], + errors: [], }; // Simulate scrape checking signal and throwing (mockScraperService.scrape as Mock).mockImplementation( async (_options, progressCallback, _signal) => { - const progress: ScraperProgress = { + const progress: ScraperProgressEvent = { pagesScraped: 1, totalPages: 2, currentUrl: "url1", depth: 1, maxDepth: 1, - document: mockDoc, + result: mockProcessed, totalDiscovered: 0, }; // Simulate cancellation happening *before* progress is processed by worker @@ -259,8 +275,8 @@ describe("PipelineWorker", () => { // Verify scrape was called expect(mockScraperService.scrape).toHaveBeenCalledOnce(); - // Verify addDocument was NOT called - expect(mockStore.addDocument).not.toHaveBeenCalled(); + // Verify addScrapeResult was NOT called + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); // Verify onJobProgress was NOT called because cancellation check happens first expect(mockCallbacks.onJobProgress).not.toHaveBeenCalled(); // Verify onJobError was NOT called @@ -289,8 +305,227 @@ describe("PipelineWorker", () => { // Verify scrape was called (now only once) expect(mockScraperService.scrape).toHaveBeenCalledOnce(); // Verify other callbacks not called - expect(mockStore.addDocument).not.toHaveBeenCalled(); + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); expect(mockCallbacks.onJobProgress).not.toHaveBeenCalled(); expect(mockCallbacks.onJobError).not.toHaveBeenCalled(); }); + + it("should fail the job if document deletion fails during refresh", async () => { + const deletionError = new Error("Database deletion failed"); + + // Simulate scrape yielding a deletion event (404 page) + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url1", + depth: 1, + maxDepth: 1, + deleted: true, // This is a deletion event + result: null, + pageId: 123, // Page ID to delete + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + // Simulate deletePage failing + (mockStore.deletePage as Mock).mockRejectedValue(deletionError); + + // Execute the job - should fail due to deletion error + await expect(worker.executeJob(mockJob, mockCallbacks)).rejects.toThrow( + "Database deletion failed", + ); + + // Verify scrape was called + 
expect(mockScraperService.scrape).toHaveBeenCalledOnce(); + // Verify deletion was attempted + expect(mockStore.deletePage).toHaveBeenCalledWith(123); + // Verify onJobProgress was called + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + // Verify onJobError was called with the deletion error + expect(mockCallbacks.onJobError).toHaveBeenCalledOnce(); + expect(mockCallbacks.onJobError).toHaveBeenCalledWith(mockJob, deletionError); + // Verify addScrapeResult was NOT called (deletion failed before that) + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); + }); + + describe("Database operations based on fetch status", () => { + it("should perform NO database writes for a 304 Not Modified status", async () => { + // Simulate scrape yielding a 304 Not Modified event + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url1", + depth: 1, + maxDepth: 1, + result: null, // No result for 304 + deleted: false, + pageId: 123, // Page ID from refresh queue + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + await worker.executeJob(mockJob, mockCallbacks); + + // Verify NO database operations were performed + expect(mockStore.deletePage).not.toHaveBeenCalled(); + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); + + // Verify progress was still reported + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + expect(mockCallbacks.onJobProgress).toHaveBeenCalledWith( + mockJob, + expect.objectContaining({ + result: null, + deleted: false, + pageId: 123, + }), + ); + }); + + it("should DELETE existing documents and INSERT new ones for a 200 OK status on an existing page", async () => { + const mockResult: ScrapeResult = { + textContent: "updated content", + url: "url1", + title: "Updated Doc", + contentType: "text/html", + chunks: [], + links: [], + errors: [], + }; + + // Simulate scrape yielding a 200 OK event with pageId (existing page) + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url1", + depth: 1, + maxDepth: 1, + result: mockResult, + pageId: 123, // Existing page ID + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + await worker.executeJob(mockJob, mockCallbacks); + + // Verify DELETE was called first + expect(mockStore.deletePage).toHaveBeenCalledOnce(); + expect(mockStore.deletePage).toHaveBeenCalledWith(123); + + // Verify INSERT (addScrapeResult) was called after deletion + expect(mockStore.addScrapeResult).toHaveBeenCalledOnce(); + expect(mockStore.addScrapeResult).toHaveBeenCalledWith( + mockJob.library, + mockJob.version, + 1, + mockResult, + ); + + // Verify call order: delete before add + const deleteCallOrder = (mockStore.deletePage as Mock).mock.invocationCallOrder[0]; + const addCallOrder = (mockStore.addScrapeResult as Mock).mock + .invocationCallOrder[0]; + expect(deleteCallOrder).toBeLessThan(addCallOrder); + + // Verify progress was reported + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + }); + + it("should INSERT new documents for a 200 OK status on a new page", async () => { + const mockResult: ScrapeResult = { + textContent: "new content", + url: "url2", + title: "New Doc", + contentType: "text/html", + chunks: [], + links: [], + errors: [], + }; + + // Simulate scrape 
yielding a 200 OK event without pageId (new page) + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url2", + depth: 1, + maxDepth: 1, + result: mockResult, + pageId: undefined, // No pageId = new page + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + await worker.executeJob(mockJob, mockCallbacks); + + // Verify NO deletion was performed (new page) + expect(mockStore.deletePage).not.toHaveBeenCalled(); + + // Verify INSERT (addScrapeResult) was called + expect(mockStore.addScrapeResult).toHaveBeenCalledOnce(); + expect(mockStore.addScrapeResult).toHaveBeenCalledWith( + mockJob.library, + mockJob.version, + 1, + mockResult, + ); + + // Verify progress was reported + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + }); + + it("should call deletePage for a 404 Not Found status", async () => { + // Simulate scrape yielding a 404 Not Found event + (mockScraperService.scrape as Mock).mockImplementation( + async (_options, progressCallback, _signal) => { + const progress: ScraperProgressEvent = { + pagesScraped: 1, + totalPages: 1, + currentUrl: "url1", + depth: 1, + maxDepth: 1, + result: null, + deleted: true, // 404 - page was deleted + pageId: 123, + totalDiscovered: 0, + }; + await progressCallback(progress); + }, + ); + + await worker.executeJob(mockJob, mockCallbacks); + + // Verify deletion was called + expect(mockStore.deletePage).toHaveBeenCalledOnce(); + expect(mockStore.deletePage).toHaveBeenCalledWith(123); + + // Verify NO insert was performed + expect(mockStore.addScrapeResult).not.toHaveBeenCalled(); + + // Verify progress was reported + expect(mockCallbacks.onJobProgress).toHaveBeenCalledOnce(); + expect(mockCallbacks.onJobProgress).toHaveBeenCalledWith( + mockJob, + expect.objectContaining({ + deleted: true, + pageId: 123, + }), + ); + }); + }); }); diff --git a/src/pipeline/PipelineWorker.ts b/src/pipeline/PipelineWorker.ts index ae3c3977..aad6d92f 100644 --- a/src/pipeline/PipelineWorker.ts +++ b/src/pipeline/PipelineWorker.ts @@ -1,5 +1,5 @@ import type { ScraperService } from "../scraper"; -import type { ScraperProgress } from "../scraper/types"; +import type { ScraperProgressEvent } from "../scraper/types"; import type { DocumentManagementService } from "../store"; import { logger } from "../utils/logger"; import { CancellationError } from "./errors"; @@ -29,37 +29,29 @@ export class PipelineWorker { job: InternalPipelineJob, callbacks: PipelineManagerCallbacks, ): Promise { - const { - id: jobId, - library, - version, - sourceUrl, - scraperOptions, - abortController, - } = job; + const { id: jobId, library, version, scraperOptions, abortController } = job; const signal = abortController.signal; logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`); try { // Clear existing documents for this library/version before scraping - await this.store.removeAllDocuments(library, version); - logger.info( - `💾 Cleared store for ${library}@${version || "[no version]"} before scraping.`, - ); - - // Construct runtime options from job context + stored configuration - const runtimeOptions = { - url: sourceUrl ?? 
"", - library, - version, - ...scraperOptions, - }; + // Skip this step for refresh operations to preserve existing data + if (!scraperOptions.isRefresh) { + await this.store.removeAllDocuments(library, version); + logger.info( + `💾 Cleared store for ${library}@${version || "[no version]"} before scraping.`, + ); + } else { + logger.info( + `🔄 Refresh operation - preserving existing data for ${library}@${version || "[no version]"}.`, + ); + } // --- Core Job Logic --- await this.scraperService.scrape( - runtimeOptions, - async (progress: ScraperProgress) => { + scraperOptions, + async (progress: ScraperProgressEvent) => { // Check for cancellation signal before processing each document if (signal.aborted) { throw new CancellationError("Job cancelled during scraping progress"); @@ -69,27 +61,55 @@ export class PipelineWorker { // Report progress via manager's callback (single source of truth) await callbacks.onJobProgress?.(job, progress); - if (progress.document) { + // Handle deletion events (404 during refresh or broken links) + if (progress.deleted && progress.pageId) { try { - await this.store.addDocument(library, version, { - pageContent: progress.document.content, - metadata: { - ...progress.document.metadata, - mimeType: progress.document.contentType, // Pass contentType as mimeType in metadata - }, - }); + await this.store.deletePage(progress.pageId); logger.debug( - `[${jobId}] Stored document: ${progress.document.metadata.url}`, + `[${jobId}] Deleted page ${progress.pageId}: ${progress.currentUrl}`, + ); + } catch (docError) { + logger.error( + `❌ [${jobId}] Failed to delete page ${progress.pageId}: ${docError}`, + ); + + // Report the error and fail the job to ensure data integrity + const error = + docError instanceof Error ? docError : new Error(String(docError)); + await callbacks.onJobError?.(job, error); + // Re-throw to fail the job - deletion failures indicate serious database issues + // and leaving orphaned documents would compromise index accuracy + throw error; + } + } + // Handle successful content processing + else if (progress.result) { + try { + // For refresh operations, delete old documents before adding new ones + if (progress.pageId) { + await this.store.deletePage(progress.pageId); + logger.debug( + `[${jobId}] Refreshing page ${progress.pageId}: ${progress.currentUrl}`, + ); + } + + // Add the processed content to the store + await this.store.addScrapeResult( + library, + version, + progress.depth, + progress.result, ); + logger.debug(`[${jobId}] Stored processed content: ${progress.currentUrl}`); } catch (docError) { logger.error( - `❌ [${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`, + `❌ [${jobId}] Failed to process content ${progress.currentUrl}: ${docError}`, ); // Report document-specific errors via manager's callback await callbacks.onJobError?.( job, docError instanceof Error ? docError : new Error(String(docError)), - progress.document, + progress.result, ); // Decide if a single document error should fail the whole job // For now, we log and continue. To fail, re-throw here. 
diff --git a/src/pipeline/trpc/interfaces.ts b/src/pipeline/trpc/interfaces.ts index 9d806b22..0fd18173 100644 --- a/src/pipeline/trpc/interfaces.ts +++ b/src/pipeline/trpc/interfaces.ts @@ -20,11 +20,12 @@ export interface PipelineOptions { export interface IPipeline { start(): Promise; stop(): Promise; - enqueueJob( + enqueueScrapeJob( library: string, version: string | undefined | null, options: ScraperOptions, ): Promise; + enqueueRefreshJob(library: string, version: string | undefined | null): Promise; getJob(jobId: string): Promise; getJobs(status?: PipelineJobStatus): Promise; cancelJob(jobId: string): Promise; diff --git a/src/pipeline/trpc/router.ts b/src/pipeline/trpc/router.ts index d7a41cd6..26d218fd 100644 --- a/src/pipeline/trpc/router.ts +++ b/src/pipeline/trpc/router.ts @@ -31,12 +31,17 @@ const optionalTrimmed = z.preprocess( z.string().min(1).optional().nullable(), ); -const enqueueInput = z.object({ +const enqueueScrapeInput = z.object({ library: nonEmptyTrimmed, version: optionalTrimmed, options: z.custom(), }); +const enqueueRefreshInput = z.object({ + library: nonEmptyTrimmed, + version: optionalTrimmed, +}); + const jobIdInput = z.object({ id: z.string().min(1) }); const getJobsInput = z.object({ @@ -47,17 +52,17 @@ const getJobsInput = z.object({ export function createPipelineRouter(trpc: unknown) { const tt = trpc as typeof t; return tt.router({ - enqueueJob: tt.procedure - .input(enqueueInput) + enqueueScrapeJob: tt.procedure + .input(enqueueScrapeInput) .mutation( async ({ ctx, input, }: { ctx: PipelineTrpcContext; - input: z.infer; + input: z.infer; }) => { - const jobId = await ctx.pipeline.enqueueJob( + const jobId = await ctx.pipeline.enqueueScrapeJob( input.library, input.version ?? null, input.options, @@ -83,6 +88,25 @@ export function createPipelineRouter(trpc: unknown) { }, ), + enqueueRefreshJob: tt.procedure + .input(enqueueRefreshInput) + .mutation( + async ({ + ctx, + input, + }: { + ctx: PipelineTrpcContext; + input: z.infer; + }) => { + const jobId = await ctx.pipeline.enqueueRefreshJob( + input.library, + input.version ?? null, + ); + + return { jobId }; + }, + ), + getJob: tt.procedure .input(jobIdInput) .query( diff --git a/src/pipeline/types.ts b/src/pipeline/types.ts index 4e3e5b43..2ac6f893 100644 --- a/src/pipeline/types.ts +++ b/src/pipeline/types.ts @@ -1,6 +1,9 @@ -import type { ScraperProgress } from "../scraper/types"; -import type { VersionScraperOptions, VersionStatus } from "../store/types"; -import type { Document } from "../types"; // Use local Document type +import type { + ScrapeResult, + ScraperOptions, + ScraperProgressEvent, +} from "../scraper/types"; +import type { VersionStatus } from "../store/types"; /** * Represents the possible states of a pipeline job. @@ -28,7 +31,7 @@ export interface PipelineJob { /** Current pipeline status of the job. */ status: PipelineJobStatus; /** Detailed progress information. */ - progress: ScraperProgress | null; + progress: ScraperProgressEvent | null; /** Error information if the job failed. */ error: { message: string } | null; /** Timestamp when the job was created. */ @@ -52,18 +55,24 @@ export interface PipelineJob { /** Original scraping URL. */ sourceUrl: string | null; /** Stored scraper options for reproducibility. */ - scraperOptions: VersionScraperOptions | null; + scraperOptions: ScraperOptions | null; } /** * Internal pipeline job representation used within PipelineManager. * Contains non-serializable fields for job management and control. 
+ * + * Note: scraperOptions is required (non-nullable) for internal jobs as they + * always have complete runtime configuration available. */ -export interface InternalPipelineJob extends Omit { +export interface InternalPipelineJob + extends Omit { /** The library version associated with the job (internal uses string). */ version: string; /** Error object if the job failed. */ error: Error | null; + /** Complete scraper options with runtime configuration. */ + scraperOptions: ScraperOptions; /** AbortController to signal cancellation. */ abortController: AbortController; /** Promise that resolves/rejects when the job finishes. */ @@ -82,11 +91,14 @@ export interface PipelineManagerCallbacks { /** Callback triggered when a job's status changes. */ onJobStatusChange?: (job: InternalPipelineJob) => Promise; /** Callback triggered when a job makes progress. */ - onJobProgress?: (job: InternalPipelineJob, progress: ScraperProgress) => Promise; + onJobProgress?: ( + job: InternalPipelineJob, + progress: ScraperProgressEvent, + ) => Promise; /** Callback triggered when a job encounters an error during processing (e.g., storing a doc). */ onJobError?: ( job: InternalPipelineJob, error: Error, - document?: Document, + page?: ScrapeResult, ) => Promise; } diff --git a/src/scraper/ScraperRegistry.ts b/src/scraper/ScraperRegistry.ts index 5d8f4846..9248deec 100644 --- a/src/scraper/ScraperRegistry.ts +++ b/src/scraper/ScraperRegistry.ts @@ -2,7 +2,6 @@ import { logger } from "../utils"; import { ScraperError } from "../utils/errors"; import { validateUrl } from "../utils/url"; import { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy"; -import { GitHubWikiScraperStrategy } from "./strategies/GitHubWikiScraperStrategy"; import { LocalFileStrategy } from "./strategies/LocalFileStrategy"; import { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; import { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; @@ -16,7 +15,6 @@ export class ScraperRegistry { this.strategies = [ new NpmScraperStrategy(), new PyPiScraperStrategy(), - new GitHubWikiScraperStrategy(), new GitHubScraperStrategy(), new WebScraperStrategy(), new LocalFileStrategy(), diff --git a/src/scraper/ScraperService.test.ts b/src/scraper/ScraperService.test.ts index 6c490284..8faa842b 100644 --- a/src/scraper/ScraperService.test.ts +++ b/src/scraper/ScraperService.test.ts @@ -3,7 +3,7 @@ import type { ProgressCallback } from "../types"; import { ScraperError } from "../utils/errors"; import type { ScraperRegistry } from "./ScraperRegistry"; import { ScraperService } from "./ScraperService"; -import type { ScraperOptions, ScraperProgress } from "./types"; +import type { ScraperOptions, ScraperProgressEvent } from "./types"; vi.mock("../utils/logger"); @@ -27,7 +27,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(mockStrategy); // Call scrape without a signal (it's optional) @@ -51,7 +51,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(mockStrategy); // Call scrape without a signal @@ -74,7 +74,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); 
mockRegistry.getStrategy.mockReturnValue(mockStrategy); // Call scrape without a signal @@ -98,7 +98,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(null); @@ -117,7 +117,7 @@ describe("ScraperService", () => { maxPages: 10, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); mockRegistry.getStrategy.mockReturnValue(mockStrategy); mockStrategy.scrape.mockRejectedValue(new Error("Strategy error")); @@ -138,7 +138,7 @@ describe("ScraperService", () => { maxPages: 1, maxDepth: 1, }; - const progressCallback: ProgressCallback = vi.fn(); + const progressCallback: ProgressCallback = vi.fn(); // Mock a strategy that would handle JSON files const jsonStrategy = { diff --git a/src/scraper/ScraperService.ts b/src/scraper/ScraperService.ts index 853b3f12..c88e5334 100644 --- a/src/scraper/ScraperService.ts +++ b/src/scraper/ScraperService.ts @@ -1,7 +1,7 @@ import type { ProgressCallback } from "../types"; import { ScraperError } from "../utils/errors"; import type { ScraperRegistry } from "./ScraperRegistry"; -import type { ScraperOptions, ScraperProgress } from "./types"; +import type { ScraperOptions, ScraperProgressEvent } from "./types"; /** * Orchestrates document scraping operations using registered scraping strategies. @@ -20,7 +20,7 @@ export class ScraperService { */ async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, // Add optional signal parameter ): Promise { // Find strategy for this URL diff --git a/src/scraper/fetcher/BrowserFetcher.ts b/src/scraper/fetcher/BrowserFetcher.ts index 27446da3..37e8ffde 100644 --- a/src/scraper/fetcher/BrowserFetcher.ts +++ b/src/scraper/fetcher/BrowserFetcher.ts @@ -3,7 +3,12 @@ import { ScraperError } from "../../utils/errors"; import { logger } from "../../utils/logger"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; import { FingerprintGenerator } from "./FingerprintGenerator"; -import type { ContentFetcher, FetchOptions, RawContent } from "./types"; +import { + type ContentFetcher, + type FetchOptions, + FetchStatus, + type RawContent, +} from "./types"; /** * Fetches content using a headless browser (Playwright). @@ -72,12 +77,17 @@ export class BrowserFetcher implements ContentFetcher { const contentType = response.headers()["content-type"] || "text/html"; const { mimeType, charset } = MimeTypeUtils.parseContentType(contentType); + // Extract ETag header for caching + const etag = response.headers().etag; + return { content: contentBuffer, mimeType, charset, encoding: undefined, // Browser handles encoding automatically source: finalUrl, + etag, + status: FetchStatus.SUCCESS, } satisfies RawContent; } catch (error) { if (options?.signal?.aborted) { diff --git a/src/scraper/fetcher/FileFetcher.test.ts b/src/scraper/fetcher/FileFetcher.test.ts index 546618d6..7a96497c 100644 --- a/src/scraper/fetcher/FileFetcher.test.ts +++ b/src/scraper/fetcher/FileFetcher.test.ts @@ -43,79 +43,59 @@ describe("FileFetcher", () => { expect(result.mimeType).toBe("text/html"); }); - it("should detect source code MIME types correctly", async () => { + it.each([ + [".ts", "text/x-typescript", "interface User { name: string; }"], + [".tsx", "text/x-tsx", "export const App = () =>
<div>Hello</div>
;"], + [".py", "text/x-python", "def hello(): print('world')"], + [".go", "text/x-go", "package main\nfunc main() {}"], + [".rs", "text/x-rust", 'fn main() { println!("Hello"); }'], + [".kt", "text/x-kotlin", 'fun main() { println("Hello") }'], + [".rb", "text/x-ruby", "puts 'Hello world'"], + [".js", "text/javascript", "console.log('Hello');"], + [".css", "text/css", "body { margin: 0; }"], + [".json", "application/json", '{"name": "test"}'], + [".xml", "application/xml", ""], + [".md", "text/markdown", "# Hello"], + [".sh", "text/x-shellscript", "#!/bin/bash\necho hello"], + ])("should detect %s files as %s", async (extension, expectedMimeType, content) => { const fetcher = new FileFetcher(); - const files = { - "/code/app.ts": "interface User { name: string; }", - "/code/component.tsx": "export const App = () =>
<div>Hello</div>
;", - "/code/script.py": "def hello(): print('world')", - "/code/main.go": "package main\nfunc main() {}", - "/code/lib.rs": 'fn main() { println!("Hello"); }', - "/code/App.kt": 'fun main() { println("Hello") }', - "/code/script.rb": "puts 'Hello world'", - "/code/index.js": "console.log('Hello');", - "/code/style.css": "body { margin: 0; }", - "/code/data.json": '{"name": "test"}', - "/code/config.xml": "", - "/code/readme.md": "# Hello", - "/code/script.sh": "#!/bin/bash\necho hello", - }; - - vol.fromJSON(files); - - // Test TypeScript files - const tsResult = await fetcher.fetch("file:///code/app.ts"); - expect(tsResult.mimeType).toBe("text/x-typescript"); - - const tsxResult = await fetcher.fetch("file:///code/component.tsx"); - expect(tsxResult.mimeType).toBe("text/x-tsx"); - - // Test Python files - const pyResult = await fetcher.fetch("file:///code/script.py"); - expect(pyResult.mimeType).toBe("text/x-python"); - - // Test Go files - const goResult = await fetcher.fetch("file:///code/main.go"); - expect(goResult.mimeType).toBe("text/x-go"); - - // Test Rust files - const rsResult = await fetcher.fetch("file:///code/lib.rs"); - expect(rsResult.mimeType).toBe("text/x-rust"); - - // Test Kotlin files - const ktResult = await fetcher.fetch("file:///code/App.kt"); - expect(ktResult.mimeType).toBe("text/x-kotlin"); - - // Test Ruby files - const rbResult = await fetcher.fetch("file:///code/script.rb"); - expect(rbResult.mimeType).toBe("text/x-ruby"); - - // Test JavaScript files (fallback to mime package) - const jsResult = await fetcher.fetch("file:///code/index.js"); - expect(jsResult.mimeType).toBe("text/javascript"); - - // Test shell scripts - const shResult = await fetcher.fetch("file:///code/script.sh"); - expect(shResult.mimeType).toBe("text/x-shellscript"); - - // Test other file types (fallback to mime package) - const cssResult = await fetcher.fetch("file:///code/style.css"); - expect(cssResult.mimeType).toBe("text/css"); - - const jsonResult = await fetcher.fetch("file:///code/data.json"); - expect(jsonResult.mimeType).toBe("application/json"); - - const xmlResult = await fetcher.fetch("file:///code/config.xml"); - expect(xmlResult.mimeType).toBe("application/xml"); - - const mdResult = await fetcher.fetch("file:///code/readme.md"); - expect(mdResult.mimeType).toBe("text/markdown"); + const fileName = `/code/file${extension}`; + + vol.fromJSON({ + [fileName]: content, + }); + + const result = await fetcher.fetch(`file://${fileName}`); + expect(result.mimeType).toBe(expectedMimeType); }); - it("should throw error if file does not exist", async () => { + it("should return status NOT_FOUND if file does not exist", async () => { const fetcher = new FileFetcher(); - await expect(fetcher.fetch("file:///path/to/file.txt")).rejects.toThrow(ScraperError); + const result = await fetcher.fetch("file:///path/to/nonexistent-file.txt"); + expect(result.status).toBe("not_found"); + }); + + it("should throw ScraperError for other file system errors", async () => { + const fetcher = new FileFetcher(); + const filePath = "/path/to/permission-denied.txt"; + + // Create the file in the virtual filesystem first + vol.fromJSON({ + [filePath]: "test content", + }); + + // Simulate a permission error by mocking stat to succeed but readFile to fail + const permissionError = new Error("EACCES: permission denied"); + (permissionError as NodeJS.ErrnoException).code = "EACCES"; + const readFileSpy = vi + .spyOn(vol.promises, "readFile") + .mockRejectedValue(permissionError); + + await 
expect(fetcher.fetch(`file://${filePath}`)).rejects.toThrow(ScraperError); + + // Restore the spy + readFileSpy.mockRestore(); }); it("should only handle file protocol", async () => { @@ -200,4 +180,81 @@ describe("FileFetcher", () => { expect(result.mimeType).toBe("text/plain"); expect(result.source).toBe("file://Users/testuser/foo/bar/file.txt"); }); + + describe("File status detection for refresh", () => { + beforeEach(() => { + vol.reset(); + }); + + it("should return NOT_MODIFIED when fetching an unchanged file with its etag", async () => { + const fetcher = new FileFetcher(); + const filePath = "/test/unchanged.txt"; + + vol.fromJSON({ + [filePath]: "content", + }); + + // First fetch to get the ETag + const result1 = await fetcher.fetch(`file://${filePath}`); + const etag = result1.etag; + + // Second fetch with the same ETag should return NOT_MODIFIED + const result2 = await fetcher.fetch(`file://${filePath}`, { etag }); + + expect(result2.status).toBe("not_modified"); + expect(result2.etag).toBe(etag); + expect(result2.content).toEqual(Buffer.from("")); + }); + + it("should return SUCCESS when fetching a modified file with its old etag", async () => { + const fetcher = new FileFetcher(); + const filePath = "/test/modified.txt"; + + // Create initial file + vol.fromJSON({ + [filePath]: "initial", + }); + + const result1 = await fetcher.fetch(`file://${filePath}`); + const oldEtag = result1.etag; + + // Wait and modify file + await new Promise((resolve) => setTimeout(resolve, 10)); + vol.fromJSON({ + [filePath]: "modified", + }); + + // Fetch with old ETag should detect change and return SUCCESS + const result2 = await fetcher.fetch(`file://${filePath}`, { etag: oldEtag }); + + expect(result2.status).toBe("success"); + expect(result2.etag).not.toBe(oldEtag); + expect(result2.content.toString()).toBe("modified"); + }); + + it("should return NOT_FOUND when the file has been deleted", async () => { + const fetcher = new FileFetcher(); + + const result = await fetcher.fetch("file:///test/does-not-exist.txt"); + + expect(result.status).toBe("not_found"); + expect(result.content).toEqual(Buffer.from("")); + }); + + it("should return SUCCESS when fetching a new file without an etag", async () => { + const fetcher = new FileFetcher(); + const filePath = "/test/file.txt"; + + vol.fromJSON({ + [filePath]: "content", + }); + + // Fetch without etag should always return SUCCESS + const result = await fetcher.fetch(`file://${filePath}`); + + expect(result.status).toBe("success"); + expect(result.etag).toBeTruthy(); + expect(result.content.toString()).toBe("content"); + }); + }); }); diff --git a/src/scraper/fetcher/FileFetcher.ts b/src/scraper/fetcher/FileFetcher.ts index 219c9602..f7a6129c 100644 --- a/src/scraper/fetcher/FileFetcher.ts +++ b/src/scraper/fetcher/FileFetcher.ts @@ -1,7 +1,13 @@ +import crypto from "node:crypto"; import fs from "node:fs/promises"; import { ScraperError } from "../../utils/errors"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; -import type { ContentFetcher, FetchOptions, RawContent } from "./types"; +import { + type ContentFetcher, + type FetchOptions, + FetchStatus, + type RawContent, +} from "./types"; /** * Fetches content from local file system. @@ -14,8 +20,9 @@ export class FileFetcher implements ContentFetcher { /** * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed. * Uses enhanced MIME type detection for better source code file recognition. 
+ * Supports conditional fetching via ETag comparison for efficient refresh operations. */ - async fetch(source: string, _options?: FetchOptions): Promise { + async fetch(source: string, options?: FetchOptions): Promise { // Remove the file:// protocol prefix and handle both file:// and file:/// formats let filePath = source.replace(/^file:\/\/\/?/, ""); @@ -28,6 +35,28 @@ export class FileFetcher implements ContentFetcher { } try { + const stats = await fs.stat(filePath); + + // Generate current ETag from last modified time + const currentEtag = crypto + .createHash("md5") + .update(stats.mtime.toISOString()) + .digest("hex"); + + // Check if file has been modified (ETag comparison) + if (options?.etag && options.etag === currentEtag) { + // File hasn't changed - return NOT_MODIFIED status + return { + content: Buffer.from(""), + mimeType: "text/plain", + source, + etag: currentEtag, + lastModified: stats.mtime.toISOString(), + status: FetchStatus.NOT_MODIFIED, + }; + } + + // File is new or has been modified - read the content const content = await fs.readFile(filePath); // Use enhanced MIME type detection that properly handles source code files @@ -38,9 +67,22 @@ export class FileFetcher implements ContentFetcher { content, mimeType, source, + etag: currentEtag, + lastModified: stats.mtime.toISOString(), + status: FetchStatus.SUCCESS, // Don't assume charset for text files - let the pipeline detect it }; } catch (error: unknown) { + // Check for file not found error + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + return { + content: Buffer.from(""), + mimeType: "text/plain", + source, + status: FetchStatus.NOT_FOUND, + }; + } + // For all other errors, throw a ScraperError throw new ScraperError( `Failed to read file ${filePath}: ${ (error as { message?: string }).message ?? 
"Unknown error" diff --git a/src/scraper/fetcher/HttpFetcher.test.ts b/src/scraper/fetcher/HttpFetcher.test.ts index 5fc9e29f..3330c7c0 100644 --- a/src/scraper/fetcher/HttpFetcher.test.ts +++ b/src/scraper/fetcher/HttpFetcher.test.ts @@ -267,9 +267,8 @@ describe("HttpFetcher", () => { }); describe("retry logic", () => { - it("should retry on all retryable HTTP status codes", async () => { + it("should retry on retryable status codes [408, 429, 500, 502, 503, 504, 525]", async () => { const fetcher = new HttpFetcher(); - // Test all retryable status codes from HttpFetcher: 408, 429, 500, 502, 503, 504, 525 const retryableStatuses = [408, 429, 500, 502, 503, 504, 525]; for (const status of retryableStatuses) { @@ -290,10 +289,9 @@ describe("HttpFetcher", () => { } }); - it("should not retry on non-retryable HTTP status codes", async () => { + it("should not retry on non-retryable status codes [400, 401, 403, 404, 405, 410]", async () => { const fetcher = new HttpFetcher(); - // Test various non-retryable status codes - const nonRetryableStatuses = [400, 401, 403, 404, 405, 410]; + const nonRetryableStatuses = [400, 401, 403, 405, 410]; for (const status of nonRetryableStatuses) { mockedAxios.get.mockReset(); @@ -308,106 +306,19 @@ describe("HttpFetcher", () => { expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries } - }); - it("should retry on undefined status (network errors)", async () => { - const fetcher = new HttpFetcher(); - // Simulate network error without response object - mockedAxios.get.mockRejectedValueOnce(new Error("Network timeout")); - mockedAxios.get.mockResolvedValueOnce({ - data: Buffer.from("recovered", "utf-8"), - headers: { "content-type": "text/plain" }, - }); + // 404 has special handling - returns result instead of throwing + mockedAxios.get.mockReset(); + mockedAxios.get.mockRejectedValue({ response: { status: 404 } }); const result = await fetcher.fetch("https://example.com", { - maxRetries: 1, + maxRetries: 2, retryDelay: 1, }); - expect(result.content).toEqual(Buffer.from("recovered", "utf-8")); - expect(mockedAxios.get).toHaveBeenCalledTimes(2); - }); - - it("should use exponential backoff for retry delays", async () => { - const fetcher = new HttpFetcher(); - // Mock setTimeout to spy on delay behavior without actually waiting - const setTimeoutSpy = vi.spyOn(global, "setTimeout"); - - // Mock all retries to fail, then succeed - mockedAxios.get.mockRejectedValueOnce({ response: { status: 500 } }); - mockedAxios.get.mockRejectedValueOnce({ response: { status: 500 } }); - mockedAxios.get.mockRejectedValueOnce({ response: { status: 500 } }); - mockedAxios.get.mockResolvedValueOnce({ - data: Buffer.from("success", "utf-8"), - headers: { "content-type": "text/plain" }, - }); - - // Execute fetch with base delay of 10ms - const baseDelay = 10; - await fetcher.fetch("https://example.com", { - maxRetries: 3, - retryDelay: baseDelay, - }); - - // Verify exponential backoff: baseDelay * 2^attempt - // Attempt 0: 10ms, Attempt 1: 20ms, Attempt 2: 40ms - expect(setTimeoutSpy).toHaveBeenCalledWith(expect.any(Function), 10); - expect(setTimeoutSpy).toHaveBeenCalledWith(expect.any(Function), 20); - expect(setTimeoutSpy).toHaveBeenCalledWith(expect.any(Function), 40); - - setTimeoutSpy.mockRestore(); - }); - }); - - it("should not retry on unretryable HTTP errors", async () => { - const fetcher = new HttpFetcher(); - mockedAxios.get.mockRejectedValue({ response: { status: 404 } }); - - await expect( - fetcher.fetch("https://example.com", { - retryDelay: 1, // Use 
minimal delay - }), - ).rejects.toThrow(ScraperError); - expect(mockedAxios.get).toHaveBeenCalledTimes(1); - }); - - it("should retry on retryable HTTP errors", async () => { - const fetcher = new HttpFetcher(); - const retryableErrors = [429, 500, 503]; - for (const status of retryableErrors) { - mockedAxios.get.mockRejectedValueOnce({ response: { status } }); - } - - const htmlContent = "

<html><body>Hello</body></html>
"; - mockedAxios.get.mockResolvedValueOnce({ - data: Buffer.from(htmlContent, "utf-8"), - headers: { "content-type": "text/html" }, - }); - - // Test behavior: retry mechanism should eventually succeed - const result = await fetcher.fetch("https://example.com", { - retryDelay: 1, // Use minimal delay to speed up test - maxRetries: 3, + expect(result.status).toBe("not_found"); + expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries for 404 }); - - expect(mockedAxios.get).toHaveBeenCalledTimes(retryableErrors.length + 1); - expect(result.content).toEqual(Buffer.from(htmlContent, "utf-8")); - }); - - it("should throw error after max retries", async () => { - const fetcher = new HttpFetcher(); - const maxRetries = 2; // Use smaller number for faster test - - mockedAxios.get.mockRejectedValue({ response: { status: 502 } }); - - await expect( - fetcher.fetch("https://example.com", { - maxRetries: maxRetries, - retryDelay: 1, // Use minimal delay - }), - ).rejects.toThrow(ScraperError); - - expect(mockedAxios.get).toHaveBeenCalledTimes(maxRetries + 1); }); it("should generate fingerprint headers", async () => { @@ -421,20 +332,23 @@ describe("HttpFetcher", () => { await fetcher.fetch("https://example.com"); // Test behavior: verify that axios is called with required properties - expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", { - responseType: "arraybuffer", - headers: expect.objectContaining({ - "user-agent": expect.any(String), - accept: expect.any(String), - "accept-language": expect.any(String), - // Verify that our custom Accept-Encoding header is set (excluding zstd) - "Accept-Encoding": "gzip, deflate, br", + expect(mockedAxios.get).toHaveBeenCalledWith( + "https://example.com", + expect.objectContaining({ + responseType: "arraybuffer", + headers: expect.objectContaining({ + "user-agent": expect.any(String), + accept: expect.any(String), + "accept-language": expect.any(String), + // Verify that our custom Accept-Encoding header is set (excluding zstd) + "Accept-Encoding": "gzip, deflate, br", + }), + timeout: undefined, + maxRedirects: 5, + signal: undefined, + decompress: true, }), - timeout: undefined, - maxRedirects: 5, - signal: undefined, - decompress: true, - }); + ); }); it("should respect custom headers", async () => { @@ -449,14 +363,17 @@ describe("HttpFetcher", () => { await fetcher.fetch("https://example.com", { headers }); // Test behavior: verify custom headers are included - expect(mockedAxios.get).toHaveBeenCalledWith("https://example.com", { - responseType: "arraybuffer", - headers: expect.objectContaining(headers), - timeout: undefined, - maxRedirects: 5, - signal: undefined, - decompress: true, - }); + expect(mockedAxios.get).toHaveBeenCalledWith( + "https://example.com", + expect.objectContaining({ + responseType: "arraybuffer", + headers: expect.objectContaining(headers), + timeout: undefined, + maxRedirects: 5, + signal: undefined, + decompress: true, + }), + ); }); describe("redirect handling", () => { @@ -574,4 +491,108 @@ describe("HttpFetcher", () => { expect(result.source).toBe(finalUrl); }); }); + + describe("Conditional request headers", () => { + beforeEach(() => { + mockedAxios.get.mockReset(); + }); + + it("should send If-None-Match header when etag is provided", async () => { + const fetcher = new HttpFetcher(); + const mockResponse = { + data: Buffer.from("content", "utf-8"), + headers: { "content-type": "text/plain" }, + }; + mockedAxios.get.mockResolvedValue(mockResponse); + + await fetcher.fetch("https://example.com", { 
etag: '"abc123"' }); + + expect(mockedAxios.get).toHaveBeenCalledWith( + "https://example.com", + expect.objectContaining({ + headers: expect.objectContaining({ + "If-None-Match": '"abc123"', + }), + }), + ); + }); + + it("should NOT send If-None-Match header when etag is not provided", async () => { + const fetcher = new HttpFetcher(); + const mockResponse = { + data: Buffer.from("content", "utf-8"), + headers: { "content-type": "text/plain" }, + }; + mockedAxios.get.mockResolvedValue(mockResponse); + + await fetcher.fetch("https://example.com"); + + expect(mockedAxios.get).toHaveBeenCalledWith( + "https://example.com", + expect.objectContaining({ + headers: expect.not.objectContaining({ + "If-None-Match": expect.anything(), + }), + }), + ); + }); + }); + + describe("304 Not Modified response handling", () => { + beforeEach(() => { + mockedAxios.get.mockReset(); + }); + + it("should handle 304 responses with status='not_modified', empty content, and no retry", async () => { + const fetcher = new HttpFetcher(); + const etag = '"cached-etag-123"'; + + // 304 is treated as successful by validateStatus, so axios resolves (not rejects) + mockedAxios.get.mockResolvedValue({ + status: 304, + data: Buffer.from(""), // 304 typically has no body + headers: { etag }, + config: {}, + statusText: "Not Modified", + }); + + const result = await fetcher.fetch("https://example.com", { etag }); + + expect(result.status).toBe("not_modified"); + expect(result.etag).toBeUndefined(); // 304 response doesn't extract etag from headers + expect(result.content).toEqual(Buffer.from("")); + expect(mockedAxios.get).toHaveBeenCalledTimes(1); // No retries for 304 + }); + }); + + describe("ETag extraction from responses", () => { + beforeEach(() => { + mockedAxios.get.mockReset(); + }); + + it("should extract etag from response headers (or undefined if missing)", async () => { + const fetcher = new HttpFetcher(); + const etag = '"response-etag-456"'; + + // Test with etag present + mockedAxios.get.mockResolvedValue({ + data: Buffer.from("content", "utf-8"), + headers: { "content-type": "text/plain", etag }, + }); + + const resultWithEtag = await fetcher.fetch("https://example.com"); + expect(resultWithEtag.etag).toBe(etag); + + mockedAxios.get.mockReset(); + + // Test with etag missing + mockedAxios.get.mockResolvedValue({ + data: Buffer.from("content", "utf-8"), + headers: { "content-type": "text/plain" }, + }); + + const resultWithoutEtag = await fetcher.fetch("https://example.com"); + expect(resultWithoutEtag.etag).toBeUndefined(); + }); + }); }); diff --git a/src/scraper/fetcher/HttpFetcher.ts b/src/scraper/fetcher/HttpFetcher.ts index 3e1da077..65652633 100644 --- a/src/scraper/fetcher/HttpFetcher.ts +++ b/src/scraper/fetcher/HttpFetcher.ts @@ -6,7 +6,12 @@ import { ChallengeError, RedirectError, ScraperError } from "../../utils/errors" import { logger } from "../../utils/logger"; import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; import { FingerprintGenerator } from "./FingerprintGenerator"; -import type { ContentFetcher, FetchOptions, RawContent } from "./types"; +import { + type ContentFetcher, + type FetchOptions, + FetchStatus, + type RawContent, +} from "./types"; /** * Fetches content from remote sources using HTTP/HTTPS. 
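For orientation before the hunk below: a minimal sketch of the conditional-request round trip that HttpFetcher gains in this change — send a stored ETag as If-None-Match, accept 304 via validateStatus, and map it to a not-modified result. The helper name `conditionalGet` and the `storedEtag` variable are illustrative and not part of the diff.

```typescript
import axios from "axios";

// Sketch only: how a stored ETag turns into an If-None-Match round trip.
// `storedEtag` is a hypothetical value persisted from an earlier fetch.
async function conditionalGet(url: string, storedEtag?: string) {
  const response = await axios.get(url, {
    responseType: "arraybuffer",
    headers: storedEtag ? { "If-None-Match": storedEtag } : {},
    // Treat 2xx and 304 as non-errors, mirroring the validateStatus added below.
    validateStatus: (status) => (status >= 200 && status < 300) || status === 304,
  });

  if (response.status === 304) {
    // Server confirmed the cached copy is still current; there is no body to process.
    return { status: "not_modified" as const, etag: storedEtag };
  }

  // Fresh content: keep the new ETag (if any) for the next refresh cycle.
  return {
    status: "success" as const,
    etag: response.headers.etag as string | undefined,
    content: Buffer.from(response.data),
  };
}
```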
@@ -116,11 +121,19 @@ export class HttpFetcher implements ContentFetcher { for (let attempt = 0; attempt <= maxRetries; attempt++) { try { const fingerprint = this.fingerprintGenerator.generateHeaders(); - const headers = { + const headers: Record = { ...fingerprint, ...options?.headers, // User-provided headers override generated ones }; + // Add If-None-Match header for conditional requests if ETag is provided + if (options?.etag) { + headers["If-None-Match"] = options.etag; + logger.debug( + `Conditional request for ${source} with If-None-Match: ${options.etag}`, + ); + } + const config: AxiosRequestConfig = { responseType: "arraybuffer", headers: { @@ -134,10 +147,25 @@ export class HttpFetcher implements ContentFetcher { // Axios follows redirects by default, we need to explicitly disable it if needed maxRedirects: followRedirects ? 5 : 0, decompress: true, + // Allow 304 responses to be handled as successful responses + validateStatus: (status) => { + return (status >= 200 && status < 300) || status === 304; + }, }; const response = await axios.get(source, config); + // Handle 304 Not Modified responses for conditional requests + if (response.status === 304) { + logger.debug(`HTTP 304 Not Modified for ${source}`); + return { + content: Buffer.from(""), + mimeType: "text/plain", + source: source, + status: FetchStatus.NOT_MODIFIED, + } satisfies RawContent; + } + const contentTypeHeader = response.headers["content-type"]; const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader); const contentEncoding = response.headers["content-encoding"]; @@ -165,12 +193,27 @@ export class HttpFetcher implements ContentFetcher { response.config?.url || source; + // Extract ETag header for caching + const etag = response.headers.etag || response.headers.ETag; + if (etag) { + logger.debug(`Received ETag for ${source}: ${etag}`); + } + + // Extract Last-Modified header for caching + const lastModified = response.headers["last-modified"]; + const lastModifiedISO = lastModified + ? new Date(lastModified).toISOString() + : undefined; + return { content, mimeType, charset, encoding: contentEncoding, source: finalUrl, + etag, + lastModified: lastModifiedISO, + status: FetchStatus.SUCCESS, } satisfies RawContent; } catch (error: unknown) { const axiosError = error as AxiosError; @@ -183,6 +226,17 @@ export class HttpFetcher implements ContentFetcher { throw new CancellationError("HTTP fetch cancelled"); } + // Handle 404 Not Found - return special status for refresh operations + if (status === 404) { + logger.debug(`Resource not found (404): ${source}`); + return { + content: Buffer.from(""), + mimeType: "text/plain", + source: source, + status: FetchStatus.NOT_FOUND, + } satisfies RawContent; + } + // Handle redirect errors (status codes 301, 302, 303, 307, 308) if (!followRedirects && status && status >= 300 && status < 400) { const location = axiosError.response?.headers?.location; diff --git a/src/scraper/fetcher/types.ts b/src/scraper/fetcher/types.ts index 1a76393f..3769d752 100644 --- a/src/scraper/fetcher/types.ts +++ b/src/scraper/fetcher/types.ts @@ -1,3 +1,29 @@ +/** + * Semantic status of a fetch operation, abstracting HTTP status codes + * into meaningful states for content processing. + */ +export enum FetchStatus { + /** + * Content was successfully fetched (HTTP 200 or new file). + * The content field will contain the fetched data. + */ + SUCCESS = "success", + + /** + * Content has not been modified since the last fetch (HTTP 304). + * The content field will be empty. 
Occurs when etag is provided + * in FetchOptions and matches the server's current ETag. + */ + NOT_MODIFIED = "not_modified", + + /** + * The resource was not found (HTTP 404 or file doesn't exist). + * The content field will be empty. In refresh operations, + * this indicates the page should be removed from the index. + */ + NOT_FOUND = "not_found", +} + /** * Raw content fetched from a source before processing. * Includes metadata about the content for proper processing. @@ -20,6 +46,26 @@ export interface RawContent { encoding?: string; /** Original source location */ source: string; + /** + * ETag value for caching purposes. + * For HTTP sources, this comes from the ETag header. + * For local files, this is a hash of the last modified date. + */ + etag?: string; + /** + * Last modified timestamp in ISO8601 format. + * For HTTP sources, this comes from the Last-Modified header. + * For local files, this is the file modification time. + */ + lastModified?: string; + /** + * Semantic status of the fetch operation. + * Abstracts HTTP status codes into meaningful states: + * - SUCCESS: Content was fetched successfully + * - NOT_MODIFIED: Content unchanged since last fetch (conditional request) + * - NOT_FOUND: Resource doesn't exist (should be removed from index) + */ + status: FetchStatus; } /** @@ -38,6 +84,12 @@ export interface FetchOptions { signal?: AbortSignal; /** Whether to follow HTTP redirects (3xx responses) */ followRedirects?: boolean; + /** + * ETag value for conditional requests. + * When provided, the fetcher will include an If-None-Match header + * and may return a 304 Not Modified response if content hasn't changed. + */ + etag?: string | null; } /** diff --git a/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts b/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts index 1a0d01ff..67fcae0c 100644 --- a/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts +++ b/src/scraper/middleware/HtmlCheerioParserMiddleware.test.ts @@ -64,8 +64,8 @@ const createMockContext = ( ): MiddlewareContext => { return { content: htmlContent, + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, diff --git a/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts b/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts index deee15d2..f8f7a583 100644 --- a/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts +++ b/src/scraper/middleware/HtmlJsExecutorMiddleware.test.ts @@ -7,7 +7,7 @@ import { type MockedObject, vi, } from "vitest"; -import type { ContentFetcher, RawContent } from "../fetcher/types"; +import { type ContentFetcher, FetchStatus, type RawContent } from "../fetcher/types"; import type { SandboxExecutionOptions, SandboxExecutionResult } from "../utils/sandbox"; import { executeJsInSandbox } from "../utils/sandbox"; import { HtmlJsExecutorMiddleware } from "./HtmlJsExecutorMiddleware"; @@ -37,7 +37,7 @@ describe("HtmlJsExecutorMiddleware", () => { mockContext = { source: "http://example.com", content: "", // Will be set in tests - metadata: {}, + contentType: "text/html", links: [], errors: [], options: { @@ -136,6 +136,7 @@ describe("HtmlJsExecutorMiddleware", () => { content: Buffer.from(mockScriptContent), mimeType: "application/javascript", source: "http://example.com/ext.js", + status: FetchStatus.SUCCESS, }; mockFetcher.fetch.mockResolvedValue(mockRawContent); @@ -192,6 +193,7 @@ describe("HtmlJsExecutorMiddleware", () => { content: "body { color: red; }", mimeType: 
"text/css", // Incorrect MIME type source: "http://example.com/style.css", + status: FetchStatus.SUCCESS, }; mockFetcher.fetch.mockResolvedValue(mockRawContent); diff --git a/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts b/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts index c7f604de..6ad2609c 100644 --- a/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts +++ b/src/scraper/middleware/HtmlLinkExtractorMiddleware.test.ts @@ -29,8 +29,8 @@ const createMockContext = ( ): MiddlewareContext => { const context: MiddlewareContext = { content: htmlContent || "", + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, diff --git a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts index d8f3fb3b..5b567f22 100644 --- a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts +++ b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.test.ts @@ -29,8 +29,8 @@ const createMockContext = ( ): MiddlewareContext => { const context: MiddlewareContext = { content: htmlContent || "", + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, @@ -52,7 +52,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Head Title"); + expect(context.title).toBe("Head Title"); expect(context.errors).toHaveLength(0); // No need to close Cheerio object @@ -67,7 +67,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Untitled"); + expect(context.title).toBe("Untitled"); expect(context.errors).toHaveLength(0); // No need to close Cheerio object @@ -82,7 +82,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Untitled"); + expect(context.title).toBe("Untitled"); expect(context.errors).toHaveLength(0); // No need to close Cheerio object @@ -98,7 +98,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Extra Whitespace Title"); + expect(context.title).toBe("Extra Whitespace Title"); expect(context.errors).toHaveLength(0); // No need to close Cheerio object @@ -113,7 +113,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBeUndefined(); // Title should not be set + expect(context.title).toBeUndefined(); // Title should not be set expect(warnSpy).toHaveBeenCalledWith( expect.stringContaining("context.dom is missing"), ); @@ -139,7 +139,7 @@ describe("HtmlMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); // Should still call next - expect(context.metadata.title).toBeUndefined(); + expect(context.title).toBeUndefined(); expect(context.errors).toHaveLength(1); // Check if the error message includes the original error's message expect(context.errors[0].message).toContain("Failed to extract metadata from HTML"); diff --git a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts 
b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts index ac40062f..75c725f3 100644 --- a/src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts +++ b/src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts @@ -39,7 +39,7 @@ export class HtmlMetadataExtractorMiddleware implements ContentProcessorMiddlewa // Basic cleanup (replace multiple spaces with single space) title = title.replace(/\s+/g, " ").trim(); - context.metadata.title = title; + context.title = title; logger.debug(`Extracted title: "${title}" from ${context.source}`); } catch (error) { logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`); diff --git a/src/scraper/middleware/HtmlNormalizationMiddleware.test.ts b/src/scraper/middleware/HtmlNormalizationMiddleware.test.ts index 6f6092ba..888e9181 100644 --- a/src/scraper/middleware/HtmlNormalizationMiddleware.test.ts +++ b/src/scraper/middleware/HtmlNormalizationMiddleware.test.ts @@ -19,8 +19,8 @@ describe("HtmlNormalizationMiddleware", () => { }; return { content: htmlContent, + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options, @@ -37,8 +37,8 @@ describe("HtmlNormalizationMiddleware", () => { }; const context: MiddlewareContext = { content: "

<html><body>test</body></html>
", + contentType: "text/html", source: "https://example.com", - metadata: {}, links: [], errors: [], options, diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts index cca0caca..fd126b20 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.test.ts @@ -52,8 +52,8 @@ const createPipelineTestContext = ( const fullOptions = { ...createMockScraperOptions(source), ...options }; return { content, + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: fullOptions, diff --git a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts index 5088bad8..2ec1ce76 100644 --- a/src/scraper/middleware/HtmlPlaywrightMiddleware.ts +++ b/src/scraper/middleware/HtmlPlaywrightMiddleware.ts @@ -587,10 +587,7 @@ export class HtmlPlaywrightMiddleware implements ContentProcessorMiddleware { */ async process(context: MiddlewareContext, next: () => Promise): Promise { // Check if we have a MIME type from the raw content and if it's suitable for HTML processing - const contentType = - context.options?.headers?.["content-type"] || - context.metadata?.contentType || - context.metadata?.mimeType; + const contentType = context.options?.headers?.["content-type"] || context.contentType; // Safety check: If we detect this is definitely not HTML content, skip Playwright if ( diff --git a/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts b/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts index 80e00363..e38116e5 100644 --- a/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts +++ b/src/scraper/middleware/HtmlSanitizerMiddleware.test.ts @@ -33,8 +33,8 @@ const createMockContext = ( const fullOptions = { ...createMockScraperOptions(source), ...options }; const context: MiddlewareContext = { content: htmlContent || "", + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: fullOptions, diff --git a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts index 755f106a..391f0022 100644 --- a/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts +++ b/src/scraper/middleware/HtmlToMarkdownMiddleware.test.ts @@ -30,8 +30,8 @@ const createMockContext = ( ): MiddlewareContext => { const context: MiddlewareContext = { content: htmlContent || "", + contentType: "text/html", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, diff --git a/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts b/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts index c8143657..33b485d9 100644 --- a/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts +++ b/src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts @@ -28,8 +28,8 @@ const createMockContext = ( ): MiddlewareContext => { return { content: markdownContent, + contentType: "text/markdown", source, - metadata: {}, links: initialLinks, errors: [], options: { ...createMockScraperOptions(source), ...options }, diff --git a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts index 573e0895..1e4fc8e7 100644 --- a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts +++ b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.test.ts @@ -27,8 +27,8 @@ const createMockContext = ( ): 
MiddlewareContext => { return { content: markdownContent, + contentType: "text/markdown", source, - metadata: {}, links: [], errors: [], options: { ...createMockScraperOptions(source), ...options }, @@ -45,7 +45,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("My Title"); + expect(context.title).toBe("My Title"); expect(context.errors).toHaveLength(0); }); @@ -58,7 +58,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Untitled"); + expect(context.title).toBe("Untitled"); expect(context.errors).toHaveLength(0); }); @@ -71,7 +71,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("My Spaced Title"); + expect(context.title).toBe("My Spaced Title"); expect(context.errors).toHaveLength(0); }); @@ -84,7 +84,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("First Title"); + expect(context.title).toBe("First Title"); expect(context.errors).toHaveLength(0); }); @@ -97,7 +97,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("Untitled"); + expect(context.title).toBe("Untitled"); expect(context.errors).toHaveLength(0); }); @@ -110,7 +110,7 @@ describe("MarkdownMetadataExtractorMiddleware", () => { await middleware.process(context, next); expect(next).toHaveBeenCalledOnce(); - expect(context.metadata.title).toBe("The Actual Title"); + expect(context.title).toBe("The Actual Title"); expect(context.errors).toHaveLength(0); }); diff --git a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts index 55eda6e3..0a0c1f7a 100644 --- a/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts +++ b/src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts @@ -16,7 +16,7 @@ export class MarkdownMetadataExtractorMiddleware implements ContentProcessorMidd if (match?.[1]) { title = match[1].trim(); } - context.metadata.title = title; + context.title = title; } catch (error) { context.errors.push( new Error( diff --git a/src/scraper/middleware/types.ts b/src/scraper/middleware/types.ts index 2eadc375..13571e3d 100644 --- a/src/scraper/middleware/types.ts +++ b/src/scraper/middleware/types.ts @@ -6,12 +6,14 @@ import type { ScraperOptions } from "../types"; * Represents the context passed through the middleware pipeline. */ export interface MiddlewareContext { + /** The title of the page or document, extracted during processing */ + title?: string; + /** The MIME type of the content being processed. */ + contentType: string; /** The content being processed (always a string in middleware). */ content: string; /** The original source URL of the content. */ readonly source: string; - /** Extracted metadata (e.g., title). */ - metadata: Record; /** Extracted links from the content. */ links: string[]; /** Errors encountered during processing. 
*/ diff --git a/src/scraper/pipelines/BasePipeline.test.ts b/src/scraper/pipelines/BasePipeline.test.ts index 656b354a..ce4f7b42 100644 --- a/src/scraper/pipelines/BasePipeline.test.ts +++ b/src/scraper/pipelines/BasePipeline.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it, vi } from "vitest"; import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; // Create a concrete subclass of BasePipeline for testing class TestPipeline extends BasePipeline { @@ -10,8 +10,13 @@ class TestPipeline extends BasePipeline { return true; } - async process(): Promise { - return { textContent: "", metadata: {}, links: [], errors: [], chunks: [] }; + async process(): Promise { + return { + textContent: "", + links: [], + errors: [], + chunks: [], + }; } // Expose the protected method for testing @@ -39,21 +44,21 @@ describe("BasePipeline", () => { // Create mock middleware const middleware1 = { process: vi.fn(async (ctx, next) => { - ctx.metadata.step1 = true; + ctx.title = "Step 1"; await next(); }), }; const middleware2 = { process: vi.fn(async (ctx, next) => { - ctx.metadata.step2 = true; + ctx.title = "Step 2"; await next(); }), }; const middleware3 = { process: vi.fn(async (ctx, next) => { - ctx.metadata.step3 = true; + ctx.title = "Step 3"; await next(); }), }; @@ -67,10 +72,8 @@ describe("BasePipeline", () => { expect(middleware2.process).toHaveBeenCalledTimes(1); expect(middleware3.process).toHaveBeenCalledTimes(1); - // Verify the context was updated by each middleware - expect(context.metadata.step1).toBe(true); - expect(context.metadata.step2).toBe(true); - expect(context.metadata.step3).toBe(true); + // Verify the context was updated by the middleware + expect(context.title).toBe("Step 3"); }); it("executeMiddlewareStack catches errors and adds them to context", async () => { diff --git a/src/scraper/pipelines/BasePipeline.ts b/src/scraper/pipelines/BasePipeline.ts index 420a023b..672c10e9 100644 --- a/src/scraper/pipelines/BasePipeline.ts +++ b/src/scraper/pipelines/BasePipeline.ts @@ -1,7 +1,7 @@ import type { ContentFetcher, RawContent } from "../fetcher/types"; import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; import type { ScraperOptions } from "../types"; -import type { ContentPipeline, ProcessedContent } from "./types"; +import type { ContentPipeline, PipelineResult } from "./types"; /** * Base class for content processing pipelines. @@ -9,10 +9,10 @@ import type { ContentPipeline, ProcessedContent } from "./types"; */ export class BasePipeline implements ContentPipeline { /** - * Determines if this pipeline can process the given content. + * Determines if this pipeline can process content with the given MIME type. * Must be implemented by derived classes. 
*/ - public canProcess(_rawContent: RawContent): boolean { + public canProcess(_mimeType: string, _content?: Buffer): boolean { throw new Error("Method not implemented."); } @@ -24,7 +24,7 @@ export class BasePipeline implements ContentPipeline { _rawContent: RawContent, _options: ScraperOptions, _fetcher?: ContentFetcher, - ): Promise { + ): Promise { throw new Error("Method not implemented."); } diff --git a/src/scraper/pipelines/HtmlPipeline.charset.test.ts b/src/scraper/pipelines/HtmlPipeline.charset.test.ts index 2881c6ff..701881ec 100644 --- a/src/scraper/pipelines/HtmlPipeline.charset.test.ts +++ b/src/scraper/pipelines/HtmlPipeline.charset.test.ts @@ -1,5 +1,5 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { ScrapeMode } from "../types"; import { HtmlPipeline } from "./HtmlPipeline"; @@ -41,6 +41,7 @@ describe("HtmlPipeline charset integration", () => { mimeType: "text/html", charset: "utf-8", // Wrong charset from HTTP header source: "https://example.com/test.html", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(rawContent, { @@ -82,6 +83,7 @@ describe("HtmlPipeline charset integration", () => { mimeType: "text/html", charset: "iso-8859-1", // Correct charset from HTTP header source: "https://example.com/test.html", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(rawContent, { @@ -121,6 +123,7 @@ describe("HtmlPipeline charset integration", () => { mimeType: "text/html", // No charset information source: "https://example.com/test.html", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(rawContent, { diff --git a/src/scraper/pipelines/HtmlPipeline.test.ts b/src/scraper/pipelines/HtmlPipeline.test.ts index cc64b788..e8ee3adf 100644 --- a/src/scraper/pipelines/HtmlPipeline.test.ts +++ b/src/scraper/pipelines/HtmlPipeline.test.ts @@ -1,6 +1,6 @@ // Copyright (c) 2025 import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { HtmlCheerioParserMiddleware } from "../middleware/HtmlCheerioParserMiddleware"; import { HtmlLinkExtractorMiddleware } from "../middleware/HtmlLinkExtractorMiddleware"; import { HtmlMetadataExtractorMiddleware } from "../middleware/HtmlMetadataExtractorMiddleware"; @@ -21,17 +21,14 @@ describe("HtmlPipeline", () => { it("canProcess returns true for text/html", () => { const pipeline = new HtmlPipeline(); - expect(pipeline.canProcess({ mimeType: "text/html" } as RawContent)).toBe(true); - expect(pipeline.canProcess({ mimeType: "application/xhtml+xml" } as RawContent)).toBe( - true, - ); + expect(pipeline.canProcess("text/html")).toBe(true); + expect(pipeline.canProcess("application/xhtml+xml")).toBe(true); }); it("canProcess returns false for non-html", () => { const pipeline = new HtmlPipeline(); - expect(pipeline.canProcess({ mimeType: "text/markdown" } as RawContent)).toBe(false); - // @ts-expect-error - expect(pipeline.canProcess({ mimeType: undefined } as RawContent)).toBe(false); + expect(pipeline.canProcess("text/markdown")).toBe(false); + expect(pipeline.canProcess("")).toBe(false); }); it("process decodes Buffer content with UTF-8 charset", async () => { @@ -41,6 +38,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = 
await pipeline.process(raw, {} as ScraperOptions); // Check that we got some markdown content (exact format depends on the actual middleware) @@ -68,6 +66,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "iso-8859-1", // Explicitly set charset to ISO-8859-1 source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); @@ -86,6 +85,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", // No charset specified source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); // Check that we got some markdown content (exact format depends on the actual middleware) @@ -100,6 +100,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); // Check that we got some markdown content (exact format depends on the actual middleware) @@ -116,6 +117,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-16le", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("abc"); @@ -130,6 +132,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("abc"); @@ -143,6 +146,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("こんにちは世界"); @@ -156,6 +160,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("Привет, мир"); @@ -178,6 +183,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); @@ -189,7 +195,7 @@ describe("HtmlPipeline", () => { expect(HtmlToMarkdownMiddleware.prototype.process).toHaveBeenCalledTimes(1); // Verify the result contains expected data from the actual middleware - expect(result.metadata.title).toBe("Test Title"); + expect(result.title).toBe("Test Title"); expect(result.links).toContain("https://test.link/"); expect(result.textContent).toBeTruthy(); expect(result.textContent).toEqual("This is a [test link](https://test.link/)."); @@ -210,9 +216,10 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); - expect(result.errors.some((e) => e.message === "fail")).toBe(true); + expect(result.errors?.some((e) => e.message === "fail")).toBe(true); }); it("should correctly process HTML through the full standard middleware stack (E2E with spies)", async () => { @@ -242,6 +249,7 @@ describe("HtmlPipeline", () => { mimeType: "text/html", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -260,7 +268,7 @@ describe("HtmlPipeline", () => { // Verify the result contains expected data 
// The exact values will depend on the actual middleware implementations - expect(result.metadata.title).toBe("Test Page"); + expect(result.title).toBe("Test Page"); expect(result.links).toContain("https://example.com/test/link"); // Verify the content was sanitized (no script tags) and converted to markdown diff --git a/src/scraper/pipelines/HtmlPipeline.ts b/src/scraper/pipelines/HtmlPipeline.ts index b691c068..9deb9ee9 100644 --- a/src/scraper/pipelines/HtmlPipeline.ts +++ b/src/scraper/pipelines/HtmlPipeline.ts @@ -18,7 +18,7 @@ import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { resolveCharset } from "../utils/charset"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Pipeline for processing HTML content using middleware and semantic splitting with size optimization. @@ -57,15 +57,15 @@ export class HtmlPipeline extends BasePipeline { ); } - canProcess(rawContent: RawContent): boolean { - return MimeTypeUtils.isHtml(rawContent.mimeType); + canProcess(mimeType: string): boolean { + return MimeTypeUtils.isHtml(mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { // Use enhanced charset detection that considers HTML meta tags const resolvedCharset = resolveCharset( rawContent.charset, @@ -76,8 +76,9 @@ export class HtmlPipeline extends BasePipeline { const context: MiddlewareContext = { content: contentString, + contentType: rawContent.mimeType || "text/html", source: rawContent.source, - metadata: {}, + // metadata: {}, links: [], errors: [], options, @@ -99,8 +100,8 @@ export class HtmlPipeline extends BasePipeline { ); return { - textContent: typeof context.content === "string" ? 
context.content : "", - metadata: context.metadata, + title: context.title, + textContent: context.content, links: context.links, errors: context.errors, chunks, diff --git a/src/scraper/pipelines/JsonPipeline.test.ts b/src/scraper/pipelines/JsonPipeline.test.ts index f9bf9a96..4deb0993 100644 --- a/src/scraper/pipelines/JsonPipeline.test.ts +++ b/src/scraper/pipelines/JsonPipeline.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { JsonPipeline } from "./JsonPipeline"; describe("JsonPipeline", () => { @@ -16,47 +16,23 @@ describe("JsonPipeline", () => { describe("canProcess", () => { it("should accept JSON MIME types", () => { - const jsonContent: RawContent = { - content: "{}", - mimeType: "application/json", - charset: "utf-8", - source: "test.json", - }; - - expect(pipeline.canProcess(jsonContent)).toBe(true); + const pipeline = new JsonPipeline(); + expect(pipeline.canProcess("application/json")).toBe(true); }); it("should accept text/json MIME type", () => { - const jsonContent: RawContent = { - content: "{}", - mimeType: "text/json", - charset: "utf-8", - source: "test.json", - }; - - expect(pipeline.canProcess(jsonContent)).toBe(true); + const pipeline = new JsonPipeline(); + expect(pipeline.canProcess("text/json")).toBe(true); }); it("should reject non-JSON MIME types", () => { - const htmlContent: RawContent = { - content: "", - mimeType: "text/html", - charset: "utf-8", - source: "test.html", - }; - - expect(pipeline.canProcess(htmlContent)).toBe(false); + const pipeline = new JsonPipeline(); + expect(pipeline.canProcess("text/html")).toBe(false); }); it("should reject content without MIME type", () => { - const unknownContent: RawContent = { - content: "{}", - mimeType: "", - charset: "utf-8", - source: "test", - }; - - expect(pipeline.canProcess(unknownContent)).toBe(false); + const pipeline = new JsonPipeline(); + expect(pipeline.canProcess("")).toBe(false); }); }); @@ -67,19 +43,20 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "user.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); expect(result.textContent).toBe(jsonContent.content); - expect(result.metadata.title).toBe("John"); // extracted from name field - expect(result.metadata.description).toBeUndefined(); // no description field found - expect(result.metadata.isValidJson).toBe(true); - expect(result.metadata.jsonStructure).toEqual({ - type: "object", - depth: 1, - propertyCount: 2, - }); + expect(result.title).toBe("John"); // extracted from name field + // expect(result.metadata.description).toBeUndefined(); // no description field found + // expect(result.metadata.isValidJson).toBe(true); + // expect(result.metadata.jsonStructure).toEqual({ + // type: "object", + // depth: 1, + // propertyCount: 2, + // }); expect(result.links).toHaveLength(0); expect(result.errors).toHaveLength(0); }); @@ -90,19 +67,20 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "numbers.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); expect(result.textContent).toBe(jsonContent.content); - expect(result.metadata.title).toBeUndefined(); // no title field in array - expect(result.metadata.description).toBeUndefined(); // no description field in array - expect(result.metadata.isValidJson).toBe(true); - 
expect(result.metadata.jsonStructure).toEqual({ - type: "array", - depth: 1, - itemCount: 3, - }); + expect(result.title).toBeUndefined(); // no title field in array + // expect(result.metadata.description).toBeUndefined(); // no description field in array + // expect(result.metadata.isValidJson).toBe(true); + // expect(result.metadata.jsonStructure).toEqual({ + // type: "array", + // depth: 1, + // itemCount: 3, + // }); }); it("should extract title from JSON properties", async () => { @@ -119,12 +97,13 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "api.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); - expect(result.metadata.title).toBe("My API Documentation"); - expect(result.metadata.description).toBe("REST API for user management"); + expect(result.title).toBe("My API Documentation"); + // expect(result.metadata.description).toBe("REST API for user management"); }); it("should handle nested JSON structures", async () => { @@ -147,15 +126,16 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "nested.json", + status: FetchStatus.SUCCESS, }; - const result = await pipeline.process(jsonContent, baseOptions); + const _result = await pipeline.process(jsonContent, baseOptions); - expect(result.metadata.jsonStructure).toEqual({ - type: "object", - depth: 4, // user -> profile -> personal -> name/age - propertyCount: 2, // user, settings - }); + // expect(result.metadata.jsonStructure).toEqual({ + // type: "object", + // depth: 4, // user -> profile -> personal -> name/age + // propertyCount: 2, // user, settings + // }); }); it("should handle invalid JSON gracefully", async () => { @@ -164,15 +144,16 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "invalid.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); expect(result.textContent).toBe(jsonContent.content); - expect(result.metadata.title).toBeUndefined(); // no title/description fields for invalid JSON - expect(result.metadata.description).toBeUndefined(); - expect(result.metadata.isValidJson).toBe(false); - expect(result.metadata.jsonStructure).toBeUndefined(); + expect(result.title).toBeUndefined(); // no title/description fields for invalid JSON + // expect(result.metadata.description).toBeUndefined(); + // expect(result.metadata.isValidJson).toBe(false); + // expect(result.metadata.jsonStructure).toBeUndefined(); }); it("should handle JSON primitives", async () => { @@ -181,16 +162,17 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "string.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(stringContent, baseOptions); - expect(result.metadata.title).toBeUndefined(); // no title field in primitive - expect(result.metadata.description).toBeUndefined(); // no description field in primitive - expect(result.metadata.jsonStructure).toEqual({ - type: "string", - depth: 1, - }); + expect(result.title).toBeUndefined(); // no title field in primitive + // expect(result.metadata.description).toBeUndefined(); // no description field in primitive + // expect(result.metadata.jsonStructure).toEqual({ + // type: "string", + // depth: 1, + // }); }); it("should handle empty JSON structures", async () => { @@ -199,17 +181,18 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "empty.json", + status: 
FetchStatus.SUCCESS, }; const result = await pipeline.process(emptyObjectContent, baseOptions); - expect(result.metadata.title).toBeUndefined(); // no title field in empty object - expect(result.metadata.description).toBeUndefined(); // no description field in empty object - expect(result.metadata.jsonStructure).toEqual({ - type: "object", - depth: 1, - propertyCount: 0, - }); + expect(result.title).toBeUndefined(); // no title field in empty object + // expect(result.metadata.description).toBeUndefined(); // no description field in empty object + // expect(result.metadata.jsonStructure).toEqual({ + // type: "object", + // depth: 1, + // propertyCount: 0, + // }); }); it("should handle Buffer content", async () => { @@ -219,12 +202,13 @@ describe("JsonPipeline", () => { mimeType: "application/json", charset: "utf-8", source: "buffer.json", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsonContent, baseOptions); expect(result.textContent).toBe(jsonString); - expect(result.metadata.isValidJson).toBe(true); + // expect(result.metadata.isValidJson).toBe(true); }); }); }); diff --git a/src/scraper/pipelines/JsonPipeline.ts b/src/scraper/pipelines/JsonPipeline.ts index e8c984bd..c3abe6a0 100644 --- a/src/scraper/pipelines/JsonPipeline.ts +++ b/src/scraper/pipelines/JsonPipeline.ts @@ -7,7 +7,7 @@ import type { ContentProcessorMiddleware, MiddlewareContext } from "../middlewar import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Pipeline for processing JSON content with semantic, hierarchical splitting. @@ -28,16 +28,16 @@ export class JsonPipeline extends BasePipeline { }); } - canProcess(rawContent: RawContent): boolean { - if (!rawContent.mimeType) return false; - return MimeTypeUtils.isJson(rawContent.mimeType); + canProcess(mimeType: string): boolean { + if (!mimeType) return false; + return MimeTypeUtils.isJson(mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { const contentString = convertToString(rawContent.content, rawContent.charset); // Validate JSON structure @@ -55,23 +55,26 @@ export class JsonPipeline extends BasePipeline { const fallbackChunks = await this.splitter.splitText(contentString); return { textContent: contentString, - metadata: { - isValidJson: false, - }, + // metadata: { + // isValidJson: false, + // }, links: [], errors: [], chunks: fallbackChunks, }; } + const metadata = this.extractMetadata(parsedJson); const context: MiddlewareContext = { content: contentString, source: rawContent.source, - metadata: { - ...this.extractMetadata(parsedJson), - isValidJson, - jsonStructure: this.analyzeJsonStructure(parsedJson), - }, + title: metadata.title, + contentType: rawContent.mimeType || "application/json", + // metadata: { + // ...this.extractMetadata(parsedJson), + // isValidJson, + // jsonStructure: this.analyzeJsonStructure(parsedJson), + // }, links: [], // JSON files typically don't contain links errors: [], options, @@ -85,8 +88,8 @@ export class JsonPipeline extends BasePipeline { const chunks = await this.splitter.splitText(context.content); return { + title: context.title, textContent: context.content, - metadata: context.metadata, links: context.links, errors: context.errors, chunks, @@ -124,36 +127,6 @@ export class JsonPipeline extends BasePipeline { return 
metadata; } - /** - * Analyzes the structure of valid JSON for metadata - */ - private analyzeJsonStructure(parsedJson: unknown): { - type: string; - depth: number; - itemCount?: number; - propertyCount?: number; - } { - if (Array.isArray(parsedJson)) { - return { - type: "array", - depth: this.calculateDepth(parsedJson), - itemCount: parsedJson.length, - }; - } else if (typeof parsedJson === "object" && parsedJson !== null) { - const obj = parsedJson as Record; - return { - type: "object", - depth: this.calculateDepth(parsedJson), - propertyCount: Object.keys(obj).length, - }; - } else { - return { - type: typeof parsedJson, - depth: 1, - }; - } - } - /** * Calculates the maximum nesting depth of a JSON structure */ diff --git a/src/scraper/pipelines/MarkdownPipeline.test.ts b/src/scraper/pipelines/MarkdownPipeline.test.ts index 8f9cb3b2..9192a5ae 100644 --- a/src/scraper/pipelines/MarkdownPipeline.test.ts +++ b/src/scraper/pipelines/MarkdownPipeline.test.ts @@ -1,6 +1,6 @@ // Copyright (c) 2025 import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { MarkdownLinkExtractorMiddleware } from "../middleware/MarkdownLinkExtractorMiddleware"; import { MarkdownMetadataExtractorMiddleware } from "../middleware/MarkdownMetadataExtractorMiddleware"; import { ScrapeMode, type ScraperOptions } from "../types"; @@ -15,18 +15,15 @@ describe("MarkdownPipeline", () => { it("canProcess returns true for text/markdown", () => { const pipeline = new MarkdownPipeline(); - expect(pipeline.canProcess({ mimeType: "text/markdown" } as RawContent)).toBe(true); - expect(pipeline.canProcess({ mimeType: "text/x-markdown" } as RawContent)).toBe(true); + expect(pipeline.canProcess("text/markdown")).toBe(true); + expect(pipeline.canProcess("text/x-markdown")).toBe(true); }); // MarkdownPipeline now processes all text/* types as markdown, including text/html. 
it("canProcess returns false for non-text types", () => { const pipeline = new MarkdownPipeline(); - expect(pipeline.canProcess({ mimeType: "application/json" } as RawContent)).toBe( - false, - ); - // @ts-expect-error - expect(pipeline.canProcess({ mimeType: undefined } as RawContent)).toBe(false); + expect(pipeline.canProcess("application/json")).toBe(false); + expect(pipeline.canProcess("")).toBe(false); }); it("process decodes Buffer content with UTF-8 charset", async () => { @@ -36,6 +33,7 @@ describe("MarkdownPipeline", () => { mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toBe("# Header\n\nThis is a test."); @@ -67,6 +65,7 @@ describe("MarkdownPipeline", () => { mimeType: "text/markdown", charset: "iso-8859-1", // Explicitly set charset to ISO-8859-1 source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toBe("# Café"); @@ -82,6 +81,7 @@ describe("MarkdownPipeline", () => { mimeType: "text/markdown", // No charset specified source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toBe("# Default UTF-8\n\nContent"); @@ -94,6 +94,7 @@ describe("MarkdownPipeline", () => { mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toBe( @@ -118,6 +119,7 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); @@ -148,9 +150,10 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); - expect(result.errors.some((e) => e.message === "fail")).toBe(true); + expect(result.errors?.some((e) => e.message === "fail")).toBe(true); }); it("process decodes Buffer content with UTF-16LE BOM", async () => { @@ -169,6 +172,7 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-16le", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("# Café"); @@ -184,6 +188,7 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("# Café"); @@ -197,6 +202,7 @@ This is a paragraph with a [link](https://test.example.com). mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("こんにちは世界"); @@ -210,6 +216,7 @@ This is a paragraph with a [link](https://test.example.com). 
mimeType: "text/markdown", charset: "utf-8", source: "http://test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, {} as ScraperOptions); expect(result.textContent).toContain("Привет, мир"); @@ -241,6 +248,7 @@ More content here. mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -288,6 +296,7 @@ Final content in section B.`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -298,14 +307,14 @@ Final content in section B.`; }); // Verify we got chunks with proper hierarchy - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.chunks?.length).toBeGreaterThan(0); // GreedySplitter may merge small content into fewer chunks // But the hierarchy structure should still be semantically meaningful - expect(result.chunks.length).toBeGreaterThanOrEqual(1); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); // Check that all chunks have valid hierarchy metadata - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.section).toBeDefined(); expect(typeof chunk.section.level).toBe("number"); expect(Array.isArray(chunk.section.path)).toBe(true); @@ -313,8 +322,8 @@ Final content in section B.`; }); // Verify that headings and text are properly identified - const hasHeadings = result.chunks.some((chunk) => chunk.types.includes("heading")); - const hasText = result.chunks.some((chunk) => chunk.types.includes("text")); + const hasHeadings = result.chunks?.some((chunk) => chunk.types.includes("heading")); + const hasText = result.chunks?.some((chunk) => chunk.types.includes("text")); expect(hasHeadings || hasText).toBe(true); // Should have semantic content }); @@ -336,6 +345,7 @@ Content under second level.`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -346,7 +356,7 @@ Content under second level.`; }); // Should not create separate whitespace-only chunks at level 0 - const whitespaceOnlyChunks = result.chunks.filter( + const whitespaceOnlyChunks = result.chunks?.filter( (chunk) => chunk.section.level === 0 && chunk.section.path.length === 0 && @@ -355,7 +365,7 @@ Content under second level.`; expect(whitespaceOnlyChunks).toHaveLength(0); // First heading should be at level 1, not degraded by whitespace - const firstHeading = result.chunks.find( + const firstHeading = result.chunks?.find( (chunk) => chunk.types.includes("heading") && chunk.content.includes("First Heading"), ); @@ -363,7 +373,7 @@ Content under second level.`; expect(firstHeading!.section.level).toBe(1); // Minimum level should be 1 (not degraded to 0 by GreedySplitter) - const minLevel = Math.min(...result.chunks.map((c) => c.section.level)); + const minLevel = Math.min(...result.chunks!.map((c) => c.section.level)); expect(minLevel).toBe(1); }); @@ -395,6 +405,7 @@ ${longContent}`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -405,15 +416,15 @@ ${longContent}`; }); // Should have multiple chunks due to size constraints - expect(result.chunks.length).toBeGreaterThan(1); + expect(result.chunks?.length).toBeGreaterThan(1); // All chunks should be within size limits - result.chunks.forEach((chunk) => { + 
result.chunks?.forEach((chunk) => { expect(chunk.content.length).toBeLessThanOrEqual(100); }); // Should maintain hierarchy levels (not degrade to 0) - const minLevel = Math.min(...result.chunks.map((c) => c.section.level)); + const minLevel = Math.min(...result.chunks!.map((c) => c.section.level)); expect(minLevel).toBeGreaterThanOrEqual(1); }); @@ -440,6 +451,7 @@ More details here.`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -450,14 +462,14 @@ More details here.`; }); // Verify we have content with semantic types (GreedySplitter may merge them) - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.chunks?.length).toBeGreaterThan(0); // Check that we have the expected content types somewhere in the chunks - const allTypes = new Set(result.chunks.flatMap((chunk) => chunk.types)); + const allTypes = new Set(result.chunks?.flatMap((chunk) => chunk.types)); expect(allTypes.has("heading") || allTypes.has("text")).toBe(true); // Verify all chunks have proper section metadata - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.section).toBeDefined(); expect(typeof chunk.section.level).toBe("number"); expect(Array.isArray(chunk.section.path)).toBe(true); @@ -465,7 +477,7 @@ More details here.`; }); // Verify content is preserved (at least the key parts) - const allContent = result.chunks.map((chunk) => chunk.content).join(""); + const allContent = result.chunks?.map((chunk) => chunk.content).join(""); expect(allContent).toContain("Documentation"); expect(allContent).toContain("Implementation"); expect(allContent).toContain("Hello, world!"); @@ -487,6 +499,7 @@ Final paragraph.`; mimeType: "text/markdown", charset: "utf-8", source: "http://test.example.com", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(raw, { @@ -497,7 +510,7 @@ Final paragraph.`; }); // Verify semantic content is preserved (may not be perfect reconstruction due to whitespace normalization) - const allContent = result.chunks.map((chunk) => chunk.content).join(""); + const allContent = result.chunks?.map((chunk) => chunk.content).join(""); expect(allContent).toContain("# Title"); expect(allContent).toContain("## Subtitle"); expect(allContent).toContain("Paragraph with text"); @@ -506,10 +519,10 @@ Final paragraph.`; expect(allContent).toContain("Final paragraph"); // Verify we have semantic chunks - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.chunks?.length).toBeGreaterThan(0); // Verify hierarchical structure is preserved - const minLevel = Math.min(...result.chunks.map((chunk) => chunk.section.level)); + const minLevel = Math.min(...result.chunks!.map((chunk) => chunk.section.level)); expect(minLevel).toBeGreaterThanOrEqual(1); // Should not degrade to 0 }); }); diff --git a/src/scraper/pipelines/MarkdownPipeline.ts b/src/scraper/pipelines/MarkdownPipeline.ts index d856eed0..e3302392 100644 --- a/src/scraper/pipelines/MarkdownPipeline.ts +++ b/src/scraper/pipelines/MarkdownPipeline.ts @@ -12,7 +12,7 @@ import type { ContentProcessorMiddleware, MiddlewareContext } from "../middlewar import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Pipeline for processing Markdown content using middleware and semantic splitting with size 
optimization. @@ -45,22 +45,22 @@ export class MarkdownPipeline extends BasePipeline { ); } - canProcess(rawContent: RawContent): boolean { - if (!rawContent.mimeType) return false; - return MimeTypeUtils.isMarkdown(rawContent.mimeType); + canProcess(mimeType: string): boolean { + if (!mimeType) return false; + return MimeTypeUtils.isMarkdown(mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { const contentString = convertToString(rawContent.content, rawContent.charset); const context: MiddlewareContext = { + contentType: rawContent.mimeType || "text/markdown", content: contentString, source: rawContent.source, - metadata: {}, links: [], errors: [], options, @@ -77,8 +77,8 @@ export class MarkdownPipeline extends BasePipeline { ); return { + title: context.title, textContent: typeof context.content === "string" ? context.content : "", - metadata: context.metadata, links: context.links, errors: context.errors, chunks, diff --git a/src/scraper/pipelines/PipelineFactory.integration.test.ts b/src/scraper/pipelines/PipelineFactory.integration.test.ts index bbd4e2fd..67adb11f 100644 --- a/src/scraper/pipelines/PipelineFactory.integration.test.ts +++ b/src/scraper/pipelines/PipelineFactory.integration.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from "vitest"; +import { FetchStatus, type RawContent } from "../fetcher"; import { ScrapeMode } from "../types"; import { type PipelineConfiguration, PipelineFactory } from "./PipelineFactory"; @@ -23,10 +24,11 @@ describe("PipelineFactory Integration", () => { const textPipeline = pipelines[4]; // TextPipeline // Create mock RawContent for the process method - const rawContent = { + const rawContent: RawContent = { source: "test.txt", content: longContent, mimeType: "text/plain", + status: FetchStatus.SUCCESS, }; const scraperOptions = { @@ -42,8 +44,8 @@ describe("PipelineFactory Integration", () => { // Verify that chunks are smaller due to custom configuration // With 570 characters and 100 char preferred size, should be multiple chunks - expect(processed.chunks.length).toBeGreaterThan(1); // Should be split into multiple chunks - processed.chunks.forEach((chunk) => { + expect(processed.chunks?.length).toBeGreaterThan(1); // Should be split into multiple chunks + processed.chunks?.forEach((chunk) => { expect(chunk.content.length).toBeGreaterThan(0); // Should be much smaller than default 1500 expect(chunk.content.length).toBeLessThan(300); @@ -59,10 +61,11 @@ describe("PipelineFactory Integration", () => { // Test with TextPipeline const textPipeline = pipelines[4]; - const rawContent = { + const rawContent: RawContent = { source: "test.txt", content: moderateContent, mimeType: "text/plain", + status: FetchStatus.SUCCESS, }; const scraperOptions = { @@ -77,8 +80,8 @@ describe("PipelineFactory Integration", () => { const processed = await textPipeline.process(rawContent, scraperOptions); // With default chunk size (1500), this should fit in one chunk - expect(processed.chunks.length).toBe(1); - expect(processed.chunks[0].content.length).toBeLessThan(300); + expect(processed.chunks?.length).toBe(1); + expect(processed.chunks?.[0].content?.length).toBeLessThan(300); }); it("should handle different pipeline types with custom configuration", async () => { @@ -95,10 +98,11 @@ describe("PipelineFactory Integration", () => { const testContent = "This is a test content that might be split. 
".repeat(10); // ~450 characters for (const pipeline of pipelines) { - const rawContent = { + const rawContent: RawContent = { source: "test.txt", content: testContent, mimeType: "text/plain", + status: FetchStatus.SUCCESS, }; const scraperOptions = { @@ -111,10 +115,10 @@ describe("PipelineFactory Integration", () => { }; const processed = await pipeline.process(rawContent, scraperOptions); - expect(processed.chunks.length).toBeGreaterThanOrEqual(1); + expect(processed.chunks?.length).toBeGreaterThanOrEqual(1); // Verify each chunk respects the configuration - processed.chunks.forEach((chunk) => { + processed.chunks?.forEach((chunk) => { expect(chunk.content.length).toBeGreaterThan(0); // Allow some flexibility for splitting logic, but ensure it's not wildly large expect(chunk.content.length).toBeLessThan(800); @@ -140,15 +144,17 @@ describe("PipelineFactory Integration", () => { chunkSizes: { preferred: 80, max: 150 }, }); - const rawContent = { + const rawContent: RawContent = { source: "test", content, mimeType, + status: FetchStatus.SUCCESS, }; // Find the first pipeline that can process this content + const contentBuffer = Buffer.from(content); for (const pipeline of pipelines) { - if (pipeline.canProcess(rawContent)) { + if (pipeline.canProcess(mimeType, contentBuffer)) { return await pipeline.process(rawContent, baseOptions); } } @@ -171,19 +177,17 @@ describe("PipelineFactory Integration", () => { const result = await processContent(htmlContent, "text/html"); // HTML should be converted to markdown and create hierarchical structure - expect(result.chunks.length).toBeGreaterThan(1); + expect(result.chunks?.length).toBeGreaterThan(1); // Should have chunks with heading-based hierarchy - const headingChunks = result.chunks.filter( + const headingChunks = result.chunks?.filter( (chunk) => chunk.types.includes("heading") || chunk.section.path.length > 0, ); - expect(headingChunks.length).toBeGreaterThan(0); + expect(headingChunks?.length).toBeGreaterThan(0); // Should convert table to markdown format - const tableChunks = result.chunks.filter((chunk) => chunk.types.includes("table")); - if (tableChunks.length > 0) { - expect(tableChunks[0].content).toMatch(/\|.*\|/); // Markdown table format - } + const tableChunks = result.chunks?.filter((chunk) => chunk.types.includes("table")); + expect(tableChunks?.[0].content).toMatch(/\|.*\|/); // Markdown table format }); it("should process JavaScript/TypeScript with semantic code boundaries", async () => { @@ -209,18 +213,18 @@ describe("PipelineFactory Integration", () => { const result = await processContent(jsContent, "application/javascript"); // Should split along semantic boundaries (functions, classes) - expect(result.chunks.length).toBeGreaterThan(1); + expect(result.chunks?.length).toBeGreaterThan(1); // Should preserve code structure and formatting - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.types).toContain("code"); // All chunks should have content (including whitespace for perfect reconstruction) expect(chunk.content.length).toBeGreaterThan(0); }); // Should maintain perfect reconstruction - const reconstructed = result.chunks.map((chunk) => chunk.content).join(""); - expect(reconstructed.trim()).toBe(jsContent.trim()); + const reconstructed = result.chunks?.map((chunk) => chunk.content).join(""); + expect(reconstructed?.trim()).toBe(jsContent.trim()); expect(reconstructed).toContain("add(a, b)"); expect(reconstructed).toContain("multiply(a, b)"); 
expect(reconstructed).toContain('greet("World")'); @@ -256,14 +260,14 @@ describe("PipelineFactory Integration", () => { const result = await processContent(jsonContent, "application/json"); // Should handle JSON structure appropriately - expect(result.chunks.length).toBeGreaterThanOrEqual(1); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); // Should preserve JSON formatting and structure - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.content.trim()).not.toBe(""); // JSON chunks should be valid when reconstructed - const reconstructed = result.chunks.map((c) => c.content).join(""); - expect(() => JSON.parse(reconstructed)).not.toThrow(); + const reconstructed = result.chunks?.map((c) => c.content).join(""); + expect(() => JSON.parse(reconstructed || "")).not.toThrow(); }); }); @@ -298,24 +302,24 @@ More detailed content here. const result = await processContent(markdownContent, "text/markdown"); // Should create multiple chunks with different content types - expect(result.chunks.length).toBeGreaterThan(3); + expect(result.chunks?.length).toBeGreaterThan(3); // Should distinguish between content types - const contentTypes = new Set(result.chunks.flatMap((chunk) => chunk.types)); + const contentTypes = new Set(result.chunks?.flatMap((chunk) => chunk.types)); expect(contentTypes.size).toBeGreaterThan(1); // Should have multiple content types // Should create hierarchical paths based on headings - const hierarchicalChunks = result.chunks.filter( + const hierarchicalChunks = result.chunks?.filter( (chunk) => chunk.section.path.length > 0, ); - expect(hierarchicalChunks.length).toBeGreaterThan(0); + expect(hierarchicalChunks?.length).toBeGreaterThan(0); // Should preserve markdown structure - const codeChunks = result.chunks.filter((chunk) => chunk.types.includes("code")); - const tableChunks = result.chunks.filter((chunk) => chunk.types.includes("table")); + const codeChunks = result.chunks?.filter((chunk) => chunk.types.includes("code")); + const tableChunks = result.chunks?.filter((chunk) => chunk.types.includes("table")); - expect(codeChunks.length).toBeGreaterThan(0); - expect(tableChunks.length).toBeGreaterThan(0); + expect(codeChunks?.length).toBeGreaterThan(0); + expect(tableChunks?.length).toBeGreaterThan(0); }); it("should process plain text with simple structure and no hierarchy", async () => { @@ -332,18 +336,18 @@ Final paragraph here. const result = await processContent(textContent, "text/plain"); // Should split into chunks but maintain simplicity - expect(result.chunks.length).toBeGreaterThanOrEqual(1); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); // All chunks should be text type with no hierarchy - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.types).toEqual(["text"]); expect(chunk.section.path).toEqual([]); // No hierarchical structure expect(chunk.section.level).toBe(0); }); // Should preserve content exactly - const reconstructed = result.chunks.map((chunk) => chunk.content).join(""); - expect(reconstructed.trim()).toBe(textContent.trim()); + const reconstructed = result.chunks?.map((chunk) => chunk.content).join(""); + expect(reconstructed?.trim()).toBe(textContent.trim()); }); }); @@ -377,29 +381,33 @@ Content for section one that is longer than the chunk size limit. More content for section two that also exceeds the small limit. 
`; - const rawContent = { + const rawContent: RawContent = { source: "test.md", content: markdownContent, mimeType: "text/markdown", + status: FetchStatus.SUCCESS, }; // Find markdown pipeline - const markdownPipeline = pipelines.find((p) => p.canProcess(rawContent)); + const contentBuffer = Buffer.from(markdownContent); + const markdownPipeline = pipelines.find((p) => + p.canProcess(rawContent.mimeType, contentBuffer), + ); expect(markdownPipeline).toBeDefined(); const result = await markdownPipeline!.process(rawContent, baseOptions); // Even with small chunk size, should maintain semantic structure - const headingChunks = result.chunks.filter((chunk) => + const headingChunks = result.chunks?.filter((chunk) => chunk.types.includes("heading"), ); - expect(headingChunks.length).toBeGreaterThan(0); + expect(headingChunks?.length).toBeGreaterThan(0); // Should still create proper hierarchy despite size constraints - const hierarchicalChunks = result.chunks.filter( + const hierarchicalChunks = result.chunks?.filter( (chunk) => chunk.section.path.length > 0, ); - expect(hierarchicalChunks.length).toBeGreaterThan(0); + expect(hierarchicalChunks?.length).toBeGreaterThan(0); }); it("should preserve logical units in code even with large chunk sizes", async () => { @@ -423,23 +431,27 @@ class MyClass { } `; - const rawContent = { + const rawContent: RawContent = { source: "test.js", content: codeContent, mimeType: "application/javascript", + status: FetchStatus.SUCCESS, }; - const codePipeline = pipelines.find((p) => p.canProcess(rawContent)); + const contentBuffer = Buffer.from(codeContent); + const codePipeline = pipelines.find((p) => + p.canProcess(rawContent.mimeType, contentBuffer), + ); expect(codePipeline).toBeDefined(); const result = await codePipeline!.process(rawContent, baseOptions); // Even with large chunk size allowing everything in one chunk, // should still respect logical code boundaries - expect(result.chunks.length).toBeGreaterThanOrEqual(1); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); // Should maintain code structure - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.types).toContain("code"); expect(chunk.content.trim()).not.toBe(""); }); @@ -468,19 +480,23 @@ class MyClass { ]; for (const testCase of testCases) { - const rawContent = { + const rawContent: RawContent = { source: "test", content: testCase.content, mimeType: testCase.mimeType, + status: FetchStatus.SUCCESS, }; - const pipeline = pipelines.find((p) => p.canProcess(rawContent)); + const contentBuffer = Buffer.from(testCase.content); + const pipeline = pipelines.find((p) => + p.canProcess(rawContent.mimeType, contentBuffer), + ); expect(pipeline).toBeDefined(); const result = await pipeline!.process(rawContent, baseOptions); // All should respect the size constraints - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.content.length).toBeLessThanOrEqual(250); // Small buffer for edge cases }); } @@ -507,32 +523,39 @@ class MyClass { }; // No pipeline should accept unknown MIME types - const acceptingPipeline = pipelines.find((p) => p.canProcess(unknownContent)); + const contentBuffer = Buffer.from(unknownContent.content); + const acceptingPipeline = pipelines.find((p) => + p.canProcess(unknownContent.mimeType, contentBuffer), + ); expect(acceptingPipeline).toBeUndefined(); // Verify that each pipeline explicitly rejects it pipelines.forEach((pipeline) => { - expect(pipeline.canProcess(unknownContent)).toBe(false); + 
expect(pipeline.canProcess(unknownContent.mimeType, contentBuffer)).toBe(false); }); }); it("should handle invalid JSON as text content", async () => { const pipelines = PipelineFactory.createStandardPipelines(); - const invalidJsonContent = { + const invalidJsonContent: RawContent = { source: "test.json", content: '{"invalid": json, missing quotes}', mimeType: "application/json", + status: FetchStatus.SUCCESS, }; - const jsonPipeline = pipelines.find((p) => p.canProcess(invalidJsonContent)); + const contentBuffer = Buffer.from(invalidJsonContent.content); + const jsonPipeline = pipelines.find((p) => + p.canProcess(invalidJsonContent.mimeType, contentBuffer), + ); expect(jsonPipeline).toBeDefined(); const result = await jsonPipeline!.process(invalidJsonContent, baseOptions); // Should handle gracefully and process as text-like content - expect(result.chunks.length).toBeGreaterThanOrEqual(1); - expect(result.metadata.isValidJson).toBe(false); + expect(result.chunks?.length).toBeGreaterThanOrEqual(1); + // expect(result.metadata.isValidJson).toBe(false); }); it("should maintain content integrity across different processing paths", async () => { @@ -546,24 +569,28 @@ class MyClass { ]; for (const testCase of testCases) { - const rawContent = { + const rawContent: RawContent = { source: "test", content: testCase.content, mimeType: testCase.mimeType, + status: FetchStatus.SUCCESS, }; - const pipeline = pipelines.find((p) => p.canProcess(rawContent)); + const contentBuffer = Buffer.from(testCase.content); + const pipeline = pipelines.find((p) => + p.canProcess(rawContent.mimeType, contentBuffer), + ); expect(pipeline).toBeDefined(); const result = await pipeline!.process(rawContent, baseOptions); // Content should be preserved (allowing for format conversion) - expect(result.textContent.trim()).not.toBe(""); - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.textContent?.trim()).not.toBe(""); + expect(result.chunks?.length).toBeGreaterThan(0); // Should be able to reconstruct meaningful content const reconstructed = result.chunks - .map((chunk) => chunk.content) + ?.map((chunk) => chunk.content) .join("") .trim(); expect(reconstructed).not.toBe(""); diff --git a/src/scraper/pipelines/SourceCodePipeline.test.ts b/src/scraper/pipelines/SourceCodePipeline.test.ts index 31def2d3..32f1969a 100644 --- a/src/scraper/pipelines/SourceCodePipeline.test.ts +++ b/src/scraper/pipelines/SourceCodePipeline.test.ts @@ -1,5 +1,5 @@ import { beforeEach, describe, expect, it } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import type { ScraperOptions } from "../types"; import { ScrapeMode } from "../types"; import { SourceCodePipeline } from "./SourceCodePipeline"; @@ -32,44 +32,17 @@ describe("SourceCodePipeline", () => { describe("canProcess", () => { it("should accept JavaScript content types", () => { - const jsContent: RawContent = { - content: "function test() {}", - mimeType: "text/javascript", - source: "test.js", - }; - expect(pipeline.canProcess(jsContent)).toBe(true); - - const appJsContent: RawContent = { - content: "const x = 1;", - mimeType: "application/javascript", - source: "test.js", - }; - expect(pipeline.canProcess(appJsContent)).toBe(true); + expect(pipeline.canProcess("text/javascript")).toBe(true); + expect(pipeline.canProcess("application/javascript")).toBe(true); }); it("should accept TypeScript content types", () => { - const tsContent: RawContent = { - content: "interface Test { x: 
number; }", - mimeType: "text/x-typescript", - source: "test.ts", - }; - expect(pipeline.canProcess(tsContent)).toBe(true); - - const tsxContent: RawContent = { - content: "const Component = () =>
<div>Test</div>
;", - mimeType: "text/x-tsx", - source: "test.tsx", - }; - expect(pipeline.canProcess(tsxContent)).toBe(true); + expect(pipeline.canProcess("text/x-typescript")).toBe(true); + expect(pipeline.canProcess("text/x-tsx")).toBe(true); }); it("should accept JSX content types", () => { - const jsxContent: RawContent = { - content: "const Component = () =>
<div>Test</div>
;", - mimeType: "text/x-jsx", - source: "test.jsx", - }; - expect(pipeline.canProcess(jsxContent)).toBe(true); + expect(pipeline.canProcess("text/x-jsx")).toBe(true); }); it("should reject non-source code content types", () => { @@ -85,22 +58,13 @@ describe("SourceCodePipeline", () => { ]; for (const mimeType of nonCodeTypes) { - const content: RawContent = { - content: "some content", - mimeType, - source: "test.file", - }; - expect(pipeline.canProcess(content)).toBe(false); + expect(pipeline.canProcess(mimeType)).toBe(false); } }); it("should reject content without mime type", () => { - const content: RawContent = { - content: "function test() {}", - mimeType: undefined as any, - source: "test.js", - }; - expect(pipeline.canProcess(content)).toBe(false); + expect(pipeline.canProcess("")).toBe(false); + expect(pipeline.canProcess(undefined as any)).toBe(false); }); }); @@ -112,21 +76,22 @@ describe("SourceCodePipeline", () => { }`, mimeType: "text/javascript", source: "test.js", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsContent, baseOptions); expect(result.textContent).toBe(jsContent.content); - expect(result.metadata.language).toBe("javascript"); - expect(result.metadata.isSourceCode).toBe(true); + // expect(result.metadata.language).toBe("javascript"); + // expect(result.metadata.isSourceCode).toBe(true); expect(result.links).toEqual([]); expect(result.errors).toEqual([]); expect(result.chunks).toBeDefined(); expect(Array.isArray(result.chunks)).toBe(true); - expect(result.chunks.length).toBeGreaterThan(0); + expect(result.chunks?.length).toBeGreaterThan(0); // All chunks should be marked as code - result.chunks.forEach((chunk) => { + result.chunks?.forEach((chunk) => { expect(chunk.types).toContain("code"); }); }); @@ -145,17 +110,18 @@ class UserService { }`, mimeType: "text/x-typescript", source: "user.ts", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(tsContent, baseOptions); expect(result.textContent).toBe(tsContent.content); - expect(result.metadata.language).toBe("typescript"); - expect(result.metadata.isSourceCode).toBe(true); - expect(result.chunks.length).toBeGreaterThan(0); + // expect(result.metadata.language).toBe("typescript"); + // expect(result.metadata.isSourceCode).toBe(true); + expect(result.chunks?.length).toBeGreaterThan(0); // Should have at least one chunk with method-level hierarchy - const methodChunk = result.chunks.find( + const methodChunk = result.chunks?.find( (chunk) => chunk.section.path.includes("getUser") || chunk.section.path.includes("UserService"), @@ -170,24 +136,19 @@ class UserService { mimeType: "text/javascript", charset: "utf-8", source: "test.js", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(bufferContent, baseOptions); expect(result.textContent).toBe(codeString); - expect(result.metadata.language).toBe("javascript"); - expect(result.metadata.isSourceCode).toBe(true); + // expect(result.metadata.language).toBe("javascript"); + // expect(result.metadata.isSourceCode).toBe(true); }); it("should reject unknown programming language", async () => { - const unknownContent: RawContent = { - content: "some code in unknown language", - mimeType: "text/x-unknown", - source: "test.unknown", - }; - // Unknown MIME type should be rejected by canProcess - expect(pipeline.canProcess(unknownContent)).toBe(false); + expect(pipeline.canProcess("text/x-unknown")).toBe(false); }); }); @@ -217,15 +178,16 @@ class UserRepository implements Repository { content: tsCode, mimeType: 
"text/x-typescript", source: "user-repository.ts", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(tsContent, baseOptions); - expect(result.metadata.language).toBe("typescript"); - expect(result.chunks.length).toBeGreaterThan(0); + // expect(result.metadata.language).toBe("typescript"); + expect(result.chunks?.length).toBeGreaterThan(0); // Should preserve TypeScript structure - const hasUserRepositoryContent = result.chunks.some((chunk) => + const hasUserRepositoryContent = result.chunks?.some((chunk) => chunk.section.path.includes("UserRepository"), ); expect(hasUserRepositoryContent).toBe(true); @@ -277,15 +239,16 @@ export default ApiClient;`; content: jsCode, mimeType: "text/javascript", source: "api-client.js", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(jsContent, baseOptions); - expect(result.metadata.language).toBe("javascript"); - expect(result.chunks.length).toBeGreaterThan(0); + // expect(result.metadata.language).toBe("javascript"); + expect(result.chunks?.length).toBeGreaterThan(0); // Should preserve JavaScript structure - const hasApiClientContent = result.chunks.some((chunk) => + const hasApiClientContent = result.chunks?.some((chunk) => chunk.section.path.includes("ApiClient"), ); expect(hasApiClientContent).toBe(true); diff --git a/src/scraper/pipelines/SourceCodePipeline.ts b/src/scraper/pipelines/SourceCodePipeline.ts index fc5381ab..8bf892b4 100644 --- a/src/scraper/pipelines/SourceCodePipeline.ts +++ b/src/scraper/pipelines/SourceCodePipeline.ts @@ -7,7 +7,7 @@ import type { ContentProcessorMiddleware, MiddlewareContext } from "../middlewar import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Pipeline for processing source code content with semantic, structure-aware splitting. @@ -28,27 +28,28 @@ export class SourceCodePipeline extends BasePipeline { this.splitter = new TreesitterSourceCodeSplitter({ maxChunkSize: chunkSize }); } - canProcess(rawContent: RawContent): boolean { - if (!rawContent.mimeType) return false; - return MimeTypeUtils.isSourceCode(rawContent.mimeType); + canProcess(mimeType: string): boolean { + if (!mimeType) return false; + return MimeTypeUtils.isSourceCode(mimeType); } async process( rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { const contentString = convertToString(rawContent.content, rawContent.charset); const context: MiddlewareContext = { + contentType: rawContent.mimeType || "text/plain", content: contentString, source: rawContent.source, - metadata: { - language: rawContent.mimeType - ? MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType) - : "text", - isSourceCode: true, - }, + // metadata: { + // language: rawContent.mimeType + // ? 
MimeTypeUtils.extractLanguageFromMimeType(rawContent.mimeType) + // : "text", + // isSourceCode: true, + // }, links: [], // Source code files typically don't contain web links errors: [], options, @@ -62,8 +63,9 @@ export class SourceCodePipeline extends BasePipeline { const chunks = await this.splitter.splitText(context.content, rawContent.mimeType); return { + title: context.title, textContent: context.content, - metadata: context.metadata, + // metadata: context.metadata, links: context.links, errors: context.errors, chunks, diff --git a/src/scraper/pipelines/TextPipeline.test.ts b/src/scraper/pipelines/TextPipeline.test.ts index f9e4ab1a..d5b4a096 100644 --- a/src/scraper/pipelines/TextPipeline.test.ts +++ b/src/scraper/pipelines/TextPipeline.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import type { ScraperOptions } from "../types"; import { ScrapeMode } from "../types"; import { TextPipeline } from "./TextPipeline"; @@ -17,99 +17,33 @@ describe("TextPipeline", () => { describe("canProcess", () => { it("should accept text content types", () => { - const textCases: RawContent[] = [ - { - content: "plain text", - mimeType: "text/plain", - source: "test.txt", - }, - { - content: "markdown content", - mimeType: "text/markdown", - source: "test.md", - }, - { - content: "CSS content", - mimeType: "text/css", - source: "test.css", - }, - ]; - - for (const testCase of textCases) { - expect(pipeline.canProcess(testCase)).toBe(true); - } + expect(pipeline.canProcess("text/plain")).toBe(true); + expect(pipeline.canProcess("text/markdown")).toBe(true); + expect(pipeline.canProcess("text/css")).toBe(true); }); it("should accept safe application types", () => { - const safeCases: RawContent[] = [ - { - content: '', - mimeType: "application/xml", - source: "test.xml", - }, - { - content: "console.log('hello')", - mimeType: "application/javascript", - source: "test.js", - }, - { - content: "name: value", - mimeType: "application/yaml", - source: "test.yaml", - }, - ]; - - for (const testCase of safeCases) { - expect(pipeline.canProcess(testCase)).toBe(true); - } + expect(pipeline.canProcess("application/xml")).toBe(true); + expect(pipeline.canProcess("application/javascript")).toBe(true); + expect(pipeline.canProcess("application/yaml")).toBe(true); }); it("should reject binary content", () => { - const binaryCases: RawContent[] = [ - { - content: Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]), // PNG header - mimeType: "image/png", - source: "test.png", - }, - { - content: "text with null byte\0here", - mimeType: "application/octet-stream", - source: "test.bin", - }, - ]; - - for (const testCase of binaryCases) { - expect(pipeline.canProcess(testCase)).toBe(false); - } + const pngBuffer = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]); // PNG header + expect(pipeline.canProcess("image/png", pngBuffer)).toBe(false); + + const binaryContent = Buffer.from("text with null byte\0here"); + expect(pipeline.canProcess("application/octet-stream", binaryContent)).toBe(false); }); it("should reject unknown application types", () => { - const unknownCases: RawContent[] = [ - { - content: "unknown content", - mimeType: "application/unknown", - source: "test.unknown", - }, - { - content: "video data", - mimeType: "video/mp4", - source: "test.mp4", - }, - ]; - - for (const testCase of unknownCases) { - 
expect(pipeline.canProcess(testCase)).toBe(false); - } + expect(pipeline.canProcess("application/unknown")).toBe(false); + expect(pipeline.canProcess("video/mp4")).toBe(false); }); it("should reject content without mime type", () => { - const noMimeCase: RawContent = { - content: "content without mime type", - mimeType: undefined as any, - source: "test", - }; - - expect(pipeline.canProcess(noMimeCase)).toBe(false); + expect(pipeline.canProcess("")).toBe(false); + expect(pipeline.canProcess(undefined as any)).toBe(false); }); }); @@ -119,13 +53,14 @@ describe("TextPipeline", () => { content: "This is a simple text document with some content.", mimeType: "text/plain", source: "test.txt", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(textContent, baseOptions); expect(result.textContent).toBe(textContent.content); - expect(result.metadata.contentType).toBe("text/plain"); - expect(result.metadata.isGenericText).toBe(true); + // expect(result.contentType).toBe("text/plain"); + // expect(result.metadata.isGenericText).toBe(true); expect(result.links).toEqual([]); expect(result.errors).toEqual([]); expect(result.chunks).toBeDefined(); @@ -137,13 +72,14 @@ describe("TextPipeline", () => { content: "Some unknown format content", mimeType: "application/unknown", source: "test.unknown", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(unknownContent, baseOptions); expect(result.textContent).toBe(unknownContent.content); - expect(result.metadata.contentType).toBe("application/unknown"); - expect(result.metadata.isGenericText).toBe(true); + // expect(result.contentType).toBe("application/unknown"); + // expect(result.metadata.isGenericText).toBe(true); }); it("should handle content without specific mime type", async () => { @@ -151,13 +87,14 @@ describe("TextPipeline", () => { content: "Generic content", mimeType: "text/plain", source: "test", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(genericContent, baseOptions); expect(result.textContent).toBe(genericContent.content); - expect(result.metadata.contentType).toBe("text/plain"); - expect(result.metadata.isGenericText).toBe(true); + // expect(result.contentType).toBe("text/plain"); + // expect(result.metadata.isGenericText).toBe(true); }); it("should handle Buffer content", async () => { @@ -166,12 +103,13 @@ describe("TextPipeline", () => { mimeType: "text/plain", charset: "utf-8", source: "test.txt", + status: FetchStatus.SUCCESS, }; const result = await pipeline.process(bufferContent, baseOptions); expect(result.textContent).toBe("Buffer content"); - expect(result.metadata.contentType).toBe("text/plain"); + // expect(result.contentType).toBe("text/plain"); }); }); }); diff --git a/src/scraper/pipelines/TextPipeline.ts b/src/scraper/pipelines/TextPipeline.ts index 3691fafd..9591f41a 100644 --- a/src/scraper/pipelines/TextPipeline.ts +++ b/src/scraper/pipelines/TextPipeline.ts @@ -10,7 +10,7 @@ import type { ContentProcessorMiddleware, MiddlewareContext } from "../middlewar import type { ScraperOptions } from "../types"; import { convertToString } from "../utils/buffer"; import { BasePipeline } from "./BasePipeline"; -import type { ProcessedContent } from "./types"; +import type { PipelineResult } from "./types"; /** * Fallback pipeline for processing text content with basic splitting and size optimization. 
@@ -32,16 +32,16 @@ export class TextPipeline extends BasePipeline { this.splitter = new GreedySplitter(textSplitter, SPLITTER_MIN_CHUNK_SIZE, chunkSize); } - canProcess(rawContent: RawContent): boolean { + canProcess(mimeType: string, content?: string | Buffer): boolean { // This pipeline serves as a fallback for text content, but should not process binary files // First check: MIME type filtering - use utility method for safe types - if (!MimeTypeUtils.isSafeForTextProcessing(rawContent.mimeType)) { + if (!MimeTypeUtils.isSafeForTextProcessing(mimeType)) { return false; } - // Second check: binary detection via null bytes - if (MimeTypeUtils.isBinary(rawContent.content)) { + // Second check: binary detection via null bytes (if content is provided) + if (content && MimeTypeUtils.isBinary(content)) { return false; } @@ -53,16 +53,14 @@ export class TextPipeline extends BasePipeline { rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise { + ): Promise { const contentString = convertToString(rawContent.content, rawContent.charset); const context: MiddlewareContext = { + title: "", // Title extraction can be added in middleware if needed + contentType: rawContent.mimeType || "text/plain", content: contentString, source: rawContent.source, - metadata: { - contentType: rawContent.mimeType || "text/plain", - isGenericText: true, - }, links: [], // Generic text content typically doesn't contain structured links errors: [], options, @@ -76,8 +74,8 @@ export class TextPipeline extends BasePipeline { const chunks = await this.splitter.splitText(context.content, rawContent.mimeType); return { + title: context.title, textContent: context.content, - metadata: context.metadata, links: context.links, errors: context.errors, chunks, diff --git a/src/scraper/pipelines/types.ts b/src/scraper/pipelines/types.ts index ff2f29bf..39f67a90 100644 --- a/src/scraper/pipelines/types.ts +++ b/src/scraper/pipelines/types.ts @@ -1,21 +1,21 @@ -import type { ContentChunk } from "../../splitter/types"; +import type { Chunk } from "../../splitter/types"; import type { ContentFetcher, RawContent } from "../fetcher/types"; import type { ScraperOptions } from "../types"; /** * Represents the successfully processed content from a pipeline. */ -export interface ProcessedContent { +export interface PipelineResult { + /** The title of the page or document, extracted during processing */ + title?: string | null; /** The final processed content, typically as a string (e.g., Markdown). */ - textContent: string; - /** Extracted metadata (e.g., title, description). */ - metadata: Record; + textContent?: string | null; /** Extracted links from the content. */ - links: string[]; + links?: string[]; /** Any non-critical errors encountered during processing. */ - errors: Error[]; + errors?: Error[]; /** Pre-split chunks from pipeline processing */ - chunks: ContentChunk[]; + chunks?: Chunk[]; } /** @@ -25,11 +25,12 @@ export interface ProcessedContent { */ export interface ContentPipeline { /** - * Determines if this pipeline can process the given raw content. - * @param rawContent The raw content fetched from a source. + * Determines if this pipeline can process content with the given MIME type. + * @param mimeType The MIME type of the content. + * @param content Optional content (string or Buffer) for binary detection (used by TextPipeline). * @returns True if the pipeline can process the content, false otherwise. 
*/ - canProcess(rawContent: RawContent): boolean; + canProcess(mimeType: string, content?: string | Buffer): boolean; /** * Processes the raw content and optionally splits it into chunks. @@ -42,7 +43,7 @@ export interface ContentPipeline { rawContent: RawContent, options: ScraperOptions, fetcher?: ContentFetcher, - ): Promise; + ): Promise; /** * Cleanup resources used by this pipeline (e.g., browser instances, database connections). diff --git a/src/scraper/strategies/BaseScraperStrategy.test.ts b/src/scraper/strategies/BaseScraperStrategy.test.ts index bc282333..884c98fb 100644 --- a/src/scraper/strategies/BaseScraperStrategy.test.ts +++ b/src/scraper/strategies/BaseScraperStrategy.test.ts @@ -1,6 +1,8 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { ScraperOptions } from "../types"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import type { ProgressCallback } from "../../types"; +import { FetchStatus } from "../fetcher/types"; +import type { QueueItem, ScraperOptions, ScraperProgressEvent } from "../types"; +import { BaseScraperStrategy } from "./BaseScraperStrategy"; // Mock logger vi.mock("../../utils/logger"); @@ -34,11 +36,18 @@ describe("BaseScraperStrategy", () => { maxPages: 1, maxDepth: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockResolvedValue({ - document: { content: "test", metadata: {} }, + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [], + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); @@ -51,8 +60,19 @@ describe("BaseScraperStrategy", () => { currentUrl: "https://example.com/", depth: 0, maxDepth: 1, - document: { content: "test", metadata: {} }, - }); + pageId: undefined, + result: { + url: "https://example.com/", + title: "", + contentType: "", + textContent: "test", + etag: null, + lastModified: null, + links: [], + errors: [], + chunks: [], + }, + } satisfies ScraperProgressEvent); }); it("should respect maxPages", async () => { @@ -64,11 +84,18 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockResolvedValue({ - document: { content: "test", metadata: {} }, + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: ["https://example.com/page2", "https://example.com/page3"], + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); @@ -84,7 +111,7 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, ignoreErrors: true, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const error = new Error("Test error"); strategy.processItem.mockRejectedValue(error); @@ -104,7 +131,7 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, ignoreErrors: false, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const error = new Error("Test error"); strategy.processItem.mockRejectedValue(error); @@ -125,24 +152,38 @@ describe("BaseScraperStrategy", () => { maxPages: 5, maxDepth: 2, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // Return the same URLs multiple times to simulate duplicate links strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/") { return { - document: { content: "main page", metadata: {} }, + content: { + textContent: "main page", + 
metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/page1", "https://example.com/page1", // Duplicate "https://example.com/page2", "https://example.com/page2/", // Duplicate with trailing slash ], + status: FetchStatus.SUCCESS, }; } return { - document: { content: "sub page", metadata: {} }, + content: { + textContent: "sub page", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [], + status: FetchStatus.SUCCESS, }; }); @@ -170,7 +211,7 @@ describe("BaseScraperStrategy", () => { maxPages: 10, maxDepth: 2, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // First page returns variations of the same URL let firstPageCalled = false; @@ -178,7 +219,13 @@ describe("BaseScraperStrategy", () => { if (item.url === "https://example.com/") { firstPageCalled = true; return { - document: { content: "main page", metadata: {} }, + content: { + textContent: "main page", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/path/", "https://example.com/path", // Without trailing slash @@ -186,11 +233,19 @@ describe("BaseScraperStrategy", () => { "https://example.com/path?q=1#anchor", // With anchor "https://example.com/path", // Different case ], + status: FetchStatus.SUCCESS, }; } return { - document: { content: "sub page", metadata: {} }, + content: { + textContent: "sub page", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [], + status: FetchStatus.SUCCESS, }; }); @@ -219,7 +274,7 @@ describe("BaseScraperStrategy", () => { maxDepth: 3, maxConcurrency: 3, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // Simulate the link structure and timing strategy.processItem.mockImplementation(async (item: QueueItem) => { @@ -251,8 +306,15 @@ describe("BaseScraperStrategy", () => { } // X has no links return { - document: { content: `Content for ${url}`, metadata: {} }, + content: { + textContent: `Content for ${url}`, + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links, + status: FetchStatus.SUCCESS, }; }); @@ -297,19 +359,36 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, includePatterns: ["docs/*"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/docs/intro", "https://example.com/docs/other", "https://example.com/api/should-not-include", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -328,19 +407,36 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, includePatterns: ["/docs\\/intro.*/"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, 
links: [ "https://example.com/docs/intro", "https://example.com/docs/intro2", "https://example.com/docs/other", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -358,19 +454,36 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, excludePatterns: ["docs/private/*"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/docs/intro", "https://example.com/docs/private/secret", "https://example.com/docs/other", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -388,19 +501,36 @@ describe("BaseScraperStrategy", () => { maxDepth: 1, excludePatterns: ["/private/"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/docs/intro", "https://example.com/docs/private/secret", "https://example.com/docs/other", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -419,19 +549,36 @@ describe("BaseScraperStrategy", () => { includePatterns: ["docs/*"], excludePatterns: ["docs/private/*"], }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); strategy.processItem.mockImplementation(async (item: QueueItem) => { if (item.url === "https://example.com/docs/start") { return { - document: { content: "main", metadata: {} }, + content: { + textContent: "main", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, links: [ "https://example.com/docs/intro", "https://example.com/docs/private/secret", "https://example.com/docs/other", ], + status: FetchStatus.SUCCESS, }; } - return { document: { content: "sub", metadata: {} }, links: [] }; + return { + content: { + textContent: "sub", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; }); await strategy.scrape(options, progressCallback); const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); @@ -440,4 +587,480 @@ describe("BaseScraperStrategy", () => { 
expect(processedUrls).not.toContain("https://example.com/docs/private/secret"); }); }); + + describe("Refresh mode with initialQueue", () => { + beforeEach(() => { + strategy = new TestScraperStrategy(); + strategy.processItem.mockClear(); + }); + + it("should prioritize initialQueue items before discovering new links", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 10, + maxDepth: 2, + initialQueue: [ + { + url: "https://example.com/existing-page1", + depth: 1, + pageId: 101, + etag: "etag1", + }, + { + url: "https://example.com/existing-page2", + depth: 1, + pageId: 102, + etag: "etag2", + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: ["https://example.com/new-page"], + status: FetchStatus.SUCCESS, + }; + } + return { + content: { + textContent: "page content", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; + }); + + await strategy.scrape(options, progressCallback); + + // Verify initialQueue items are processed before discovered links + const processedUrls = strategy.processItem.mock.calls.map((call) => call[0].url); + const rootIndex = processedUrls.indexOf("https://example.com/"); + const existing1Index = processedUrls.indexOf("https://example.com/existing-page1"); + const existing2Index = processedUrls.indexOf("https://example.com/existing-page2"); + const newPageIndex = processedUrls.indexOf("https://example.com/new-page"); + + // Root URL should be processed first (it's added before initialQueue items) + expect(rootIndex).toBe(0); + + // InitialQueue items should be processed before newly discovered links + expect(existing1Index).toBeLessThan(newPageIndex); + expect(existing2Index).toBeLessThan(newPageIndex); + }); + + it("should preserve pageId from initialQueue items", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 10, + maxDepth: 2, + initialQueue: [ + { + url: "https://example.com/page1", + depth: 1, + pageId: 123, + etag: "etag1", + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Verify pageId flows through to processItem call + const page1Call = strategy.processItem.mock.calls.find( + (call) => call[0].url === "https://example.com/page1", + ); + expect(page1Call).toBeDefined(); + expect(page1Call![0].pageId).toBe(123); + }); + + it("should preserve etag from initialQueue items", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 10, + maxDepth: 2, + initialQueue: [ + { + url: "https://example.com/page1", + depth: 1, + pageId: 123, + etag: '"test-etag-123"', + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Verify etag flows 
through to processItem call + const page1Call = strategy.processItem.mock.calls.find( + (call) => call[0].url === "https://example.com/page1", + ); + expect(page1Call).toBeDefined(); + expect(page1Call![0].etag).toBe('"test-etag-123"'); + }); + + it("should not duplicate root URL if already in initialQueue", async () => { + const rootUrl = "https://example.com/"; + const options: ScraperOptions = { + url: rootUrl, + library: "test", + version: "1.0.0", + maxPages: 10, + maxDepth: 2, + initialQueue: [ + { + url: rootUrl, + depth: 0, + pageId: 100, + etag: '"root-etag"', + }, + { + url: "https://example.com/page1", + depth: 1, + pageId: 101, + etag: '"page1-etag"', + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Count how many times root URL was processed + const rootCalls = strategy.processItem.mock.calls.filter( + (call) => call[0].url === rootUrl, + ); + expect(rootCalls).toHaveLength(1); + + // Verify it used the pageId and etag from initialQueue + expect(rootCalls[0][0].pageId).toBe(100); + expect(rootCalls[0][0].etag).toBe('"root-etag"'); + }); + }); + + describe("Page counting with different fetch statuses", () => { + beforeEach(() => { + strategy = new TestScraperStrategy(); + strategy.processItem.mockClear(); + }); + + it("should count pages that return 200 OK", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 3, + maxDepth: 1, + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: ["https://example.com/page1", "https://example.com/page2"], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Verify all 3 pages were counted (root + 2 links) + expect(progressCallback).toHaveBeenCalledTimes(3); + const lastCall = progressCallback.mock.calls[2][0]; + expect(lastCall.pagesScraped).toBe(3); + }); + + it("should count pages that return 304 Not Modified", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 3, + maxDepth: 1, + initialQueue: [ + { url: "https://example.com/page1", depth: 1, pageId: 101, etag: "etag1" }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: ["https://example.com/page1"], + status: FetchStatus.SUCCESS, + }; + } + // page1 returns 304 + return { + content: null, + links: [], + status: FetchStatus.NOT_MODIFIED, + etag: "etag1", + }; + }); + + await strategy.scrape(options, progressCallback); + + // Verify both pages were counted (root=200, page1=304) + expect(progressCallback).toHaveBeenCalledTimes(2); + const lastCall = progressCallback.mock.calls[1][0]; + expect(lastCall.pagesScraped).toBe(2); + }); + + it("should count pages that return 404 Not Found", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 3, + maxDepth: 1, + initialQueue: [ + { + url: 
"https://example.com/deleted-page", + depth: 1, + pageId: 101, + etag: "etag1", + }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; + } + // deleted-page returns 404 + return { + content: null, + links: [], + status: FetchStatus.NOT_FOUND, + }; + }); + + await strategy.scrape(options, progressCallback); + + // Verify both pages were counted (root=200, deleted-page=404) + expect(progressCallback).toHaveBeenCalledTimes(2); + const lastCall = progressCallback.mock.calls[1][0]; + expect(lastCall.pagesScraped).toBe(2); + }); + }); + + describe("Progress callbacks with different statuses", () => { + beforeEach(() => { + strategy = new TestScraperStrategy(); + strategy.processItem.mockClear(); + }); + + it("should call progressCallback with result=null for 304 responses", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 2, + maxDepth: 1, + initialQueue: [ + { url: "https://example.com/page1", depth: 1, pageId: 101, etag: "etag1" }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; + } + // page1 returns 304 + return { + content: null, + links: [], + status: FetchStatus.NOT_MODIFIED, + etag: "etag1", + }; + }); + + await strategy.scrape(options, progressCallback); + + // Find the 304 response progress call + const progress304 = progressCallback.mock.calls.find( + (call) => call[0].currentUrl === "https://example.com/page1", + ); + expect(progress304).toBeDefined(); + expect(progress304![0].result).toBeNull(); + }); + + it("should call progressCallback with deleted=true for 404 responses", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 2, + maxDepth: 1, + initialQueue: [ + { url: "https://example.com/deleted", depth: 1, pageId: 101, etag: "etag1" }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockImplementation(async (item: QueueItem) => { + if (item.url === "https://example.com/") { + return { + content: { + textContent: "root", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }; + } + // deleted page returns 404 + return { + content: null, + links: [], + status: FetchStatus.NOT_FOUND, + }; + }); + + await strategy.scrape(options, progressCallback); + + // Find the 404 response progress call + const progress404 = progressCallback.mock.calls.find( + (call) => call[0].currentUrl === "https://example.com/deleted", + ); + expect(progress404).toBeDefined(); + expect(progress404![0].deleted).toBe(true); + expect(progress404![0].result).toBeNull(); + }); + + it("should include pageId in progress for refresh operations", async () => { + const options: ScraperOptions = { + url: "https://example.com/", + library: "test", + version: "1.0.0", + maxPages: 3, + maxDepth: 1, + initialQueue: [ + { url: "https://example.com/page1", depth: 1, pageId: 101, etag: "etag1" }, + { url: "https://example.com/page2", depth: 1, pageId: 
102, etag: "etag2" }, + ], + }; + const progressCallback = vi.fn>(); + + strategy.processItem.mockResolvedValue({ + content: { + textContent: "test", + metadata: {}, + links: [], + errors: [], + chunks: [], + }, + links: [], + status: FetchStatus.SUCCESS, + }); + + await strategy.scrape(options, progressCallback); + + // Verify pageId flows through to progress events for initialQueue items + const page1Progress = progressCallback.mock.calls.find( + (call) => call[0].currentUrl === "https://example.com/page1", + ); + const page2Progress = progressCallback.mock.calls.find( + (call) => call[0].currentUrl === "https://example.com/page2", + ); + + expect(page1Progress).toBeDefined(); + expect(page1Progress![0].pageId).toBe(101); + + expect(page2Progress).toBeDefined(); + expect(page2Progress![0].pageId).toBe(102); + }); + }); }); diff --git a/src/scraper/strategies/BaseScraperStrategy.ts b/src/scraper/strategies/BaseScraperStrategy.ts index 67aeab9e..17287d45 100644 --- a/src/scraper/strategies/BaseScraperStrategy.ts +++ b/src/scraper/strategies/BaseScraperStrategy.ts @@ -1,10 +1,18 @@ import { URL } from "node:url"; import { CancellationError } from "../../pipeline/errors"; -import type { Document, ProgressCallback } from "../../types"; +import type { ProgressCallback } from "../../types"; import { DEFAULT_MAX_PAGES } from "../../utils/config"; import { logger } from "../../utils/logger"; import { normalizeUrl, type UrlNormalizerOptions } from "../../utils/url"; -import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; +import { FetchStatus } from "../fetcher/types"; +import type { PipelineResult } from "../pipelines/types"; +import type { + QueueItem, + ScrapeResult, + ScraperOptions, + ScraperProgressEvent, + ScraperStrategy, +} from "../types"; import { shouldIncludeUrl } from "../utils/patternMatcher"; import { isInScope } from "../utils/scope"; @@ -12,16 +20,52 @@ import { isInScope } from "../utils/scope"; const DEFAULT_MAX_DEPTH = 3; const DEFAULT_CONCURRENCY = 3; -export type QueueItem = { - url: string; - depth: number; -}; - export interface BaseScraperStrategyOptions { urlNormalizerOptions?: UrlNormalizerOptions; } +/** + * Result of processing a single queue item. + * - processed: The processed content (when available) + * - links: Discovered links for crawling (may exist without content, e.g., directories) + * - status: The fetch status (SUCCESS, NOT_MODIFIED, NOT_FOUND) + */ +export interface ProcessItemResult { + /** The URL of the content */ + url: string; + /** The title of the page or document, extracted during processing */ + title?: string | null; + /** The MIME type of the content being processed, if known */ + contentType?: string | null; + /** The ETag header value from the HTTP response, if available, used for caching and change detection. */ + etag?: string | null; + /** The Last-Modified header value, if available, used for caching and change detection. */ + lastModified?: string | null; + /** The pipeline-processed content, including title, text content, links, errors, and chunks. This may be null if the content was not successfully processed (e.g., 404 or 304). */ + content?: PipelineResult; + /** Extracted links from the content. This may be an empty array if no links were found or if the content was not processed. */ + links?: string[]; + /** Any non-critical errors encountered during processing. This may be an empty array if no errors were encountered or if the content was not processed. 
*/ + status: FetchStatus; +} + export abstract class BaseScraperStrategy implements ScraperStrategy { + /** + * Set of normalized URLs that have been marked for processing. + * + * IMPORTANT: URLs are added to this set BEFORE they are actually processed, not after. + * This prevents the same URL from being queued multiple times when discovered from different sources. + * + * Usage flow: + * 1. Initial queue setup: Root URL and initialQueue items are added to visited + * 2. During processing: When a page returns links, each link is checked against visited + * 3. In processBatch deduplication: Only links NOT in visited are added to the queue AND to visited + * + * This approach ensures: + * - No URL is processed more than once + * - No URL appears in the queue multiple times + * - Efficient deduplication across concurrent processing + */ protected visited = new Set(); protected pageCount = 0; protected totalDiscovered = 0; // Track total URLs discovered (unlimited) @@ -56,26 +100,19 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { /** * Process a single item from the queue. * - * @returns A list of URLs to add to the queue + * @returns Processed content, links, and metadata */ protected abstract processItem( item: QueueItem, options: ScraperOptions, - progressCallback?: ProgressCallback, - signal?: AbortSignal, // Add signal - ): Promise<{ - document?: Document; - links?: string[]; - finalUrl?: string; // Effective fetched URL (post-redirect) - }>; - - // Removed getProcessor method as processing is now handled by strategies using middleware pipelines + signal?: AbortSignal, + ): Promise; protected async processBatch( batch: QueueItem[], baseUrl: URL, options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, // Add signal ): Promise { const maxPages = options.maxPages ?? 
DEFAULT_MAX_PAGES; @@ -93,52 +130,101 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { try { // Pass signal to processItem - const result = await this.processItem(item, options, undefined, signal); - // If this is the root (depth 0) and we have a finalUrl differing from original, set canonicalBaseUrl - if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) { - try { - const finalUrlStr = result.finalUrl as string; - const original = new URL(options.url); - const finalUrlObj = new URL(finalUrlStr); - if ( - finalUrlObj.href !== original.href && - (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:") - ) { - this.canonicalBaseUrl = finalUrlObj; - logger.debug( - `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`, - ); - } else { - this.canonicalBaseUrl = original; - } - } catch { - // Ignore canonical base errors - this.canonicalBaseUrl = new URL(options.url); - } - } + const result = await this.processItem(item, options, signal); - if (result.document) { - this.pageCount++; - // maxDepth already resolved above + // Only count items that represent tracked pages or have actual content + // - Refresh operations (have pageId): Always count (they're tracked in DB) + // - New files with content: Count (they're being indexed) + // - Directory discovery (no pageId, no content): Don't count + const shouldCount = item.pageId !== undefined || result.content !== undefined; + + let currentPageCount = this.pageCount; + if (shouldCount) { + currentPageCount = ++this.pageCount; + + // Log progress for all counted items logger.info( - `🌐 Scraping page ${this.pageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`, + `🌐 Scraping page ${currentPageCount}/${this.effectiveTotal} (depth ${item.depth}/${maxDepth}): ${item.url}`, ); + } + + if (result.status === FetchStatus.NOT_MODIFIED) { + // File/page hasn't changed, skip processing but count as processed + logger.debug(`Page unchanged (304): ${item.url}`); + if (shouldCount) { + await progressCallback({ + pagesScraped: currentPageCount, + totalPages: this.effectiveTotal, + totalDiscovered: this.totalDiscovered, + currentUrl: item.url, + depth: item.depth, + maxDepth: maxDepth, + result: null, + pageId: item.pageId, + }); + } + return []; + } + + if (result.status === FetchStatus.NOT_FOUND) { + // File/page was deleted, count as processed + logger.debug(`Page deleted (404): ${item.url}`); + if (shouldCount) { + await progressCallback({ + pagesScraped: currentPageCount, + totalPages: this.effectiveTotal, + totalDiscovered: this.totalDiscovered, + currentUrl: item.url, + depth: item.depth, + maxDepth: maxDepth, + result: null, + pageId: item.pageId, + deleted: true, + }); + } + return []; + } + + if (result.status !== FetchStatus.SUCCESS) { + logger.error(`Unknown fetch status: ${result.status}`); + return []; + } + + // Handle successful processing - report result with content + // Use the final URL from the result (which may differ due to redirects) + const finalUrl = result.url || item.url; + + if (result.content) { await progressCallback({ - pagesScraped: this.pageCount, + pagesScraped: currentPageCount, totalPages: this.effectiveTotal, totalDiscovered: this.totalDiscovered, - currentUrl: item.url, + currentUrl: finalUrl, depth: item.depth, maxDepth: maxDepth, - document: result.document, + result: { + url: finalUrl, + title: result.content.title?.trim() || result.title?.trim() || "", + contentType: result.contentType || "", + textContent: 
result.content.textContent || "", + links: result.content.links || [], + errors: result.content.errors || [], + chunks: result.content.chunks || [], + etag: result.etag || null, + lastModified: result.lastModified || null, + } satisfies ScrapeResult, + pageId: item.pageId, }); } + // Extract discovered links - use the final URL as the base for resolving relative links const nextItems = result.links || []; + const linkBaseUrl = finalUrl ? new URL(finalUrl) : baseUrl; + return nextItems .map((value) => { try { - const targetUrl = new URL(value, baseUrl); + const targetUrl = new URL(value, linkBaseUrl); // Filter using shouldProcessUrl if (!this.shouldProcessUrl(targetUrl.href, options)) { return null; @@ -190,46 +276,76 @@ export abstract class BaseScraperStrategy implements ScraperStrategy { async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, // Add signal ): Promise { this.visited.clear(); this.pageCount = 0; - this.totalDiscovered = 1; // Start with the initial URL (unlimited counter) - this.effectiveTotal = 1; // Start with the initial URL (limited counter) + // Check if this is a refresh operation with pre-populated queue + const initialQueue = options.initialQueue || []; + const isRefreshMode = initialQueue.length > 0; + + // Set up base URL and queue this.canonicalBaseUrl = new URL(options.url); let baseUrl = this.canonicalBaseUrl; - const queue = [{ url: options.url, depth: 0 } satisfies QueueItem]; - // Track values we've seen (either queued or visited) - this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions)); + // Initialize queue: Start with root URL or use items from initialQueue (refresh mode) + // The root URL is always processed (depth 0), but if it's in initialQueue, use that + // version to preserve etag/pageId for conditional fetching + const queue: QueueItem[] = []; + const normalizedRootUrl = normalizeUrl( + options.url, + this.options.urlNormalizerOptions, + ); + + if (isRefreshMode) { + logger.debug( + `Starting refresh mode with ${initialQueue.length} pre-populated pages`, + ); + + // Add all items from initialQueue, using visited set to deduplicate + for (const item of initialQueue) { + const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions); + if (!this.visited.has(normalizedUrl)) { + this.visited.add(normalizedUrl); + queue.push(item); + } + } + } + + // If root URL wasn't in initialQueue, add it now at depth 0 + if (!this.visited.has(normalizedRootUrl)) { + this.visited.add(normalizedRootUrl); + queue.unshift({ url: options.url, depth: 0 } satisfies QueueItem); + } + + // Initialize counters based on actual queue length after population + this.totalDiscovered = queue.length; + this.effectiveTotal = queue.length; // Resolve optional values to defaults using temporary variables const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES; const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY; + // Unified processing loop for both normal and refresh modes while (queue.length > 0 && this.pageCount < maxPages) { - // Use variable // Check for cancellation at the start of each loop iteration if (signal?.aborted) { - logger.debug("Scraping cancelled by signal."); - throw new CancellationError("Scraping cancelled by signal"); + logger.debug(`${isRefreshMode ? "Refresh" : "Scraping"} cancelled by signal.`); + throw new CancellationError( + `${isRefreshMode ? 
"Refresh" : "Scraping"} cancelled by signal`, + ); } - const remainingPages = maxPages - this.pageCount; // Use variable + const remainingPages = maxPages - this.pageCount; if (remainingPages <= 0) { break; } - const batchSize = Math.min( - maxConcurrency, // Use variable - remainingPages, - queue.length, - ); - + const batchSize = Math.min(maxConcurrency, remainingPages, queue.length); const batch = queue.splice(0, batchSize); - // Pass signal to processBatch + // Always use latest canonical base (may have been updated after first fetch) baseUrl = this.canonicalBaseUrl ?? baseUrl; const newUrls = await this.processBatch( diff --git a/src/scraper/strategies/GitHubRepoProcessor.ts b/src/scraper/strategies/GitHubRepoProcessor.ts new file mode 100644 index 00000000..cc100415 --- /dev/null +++ b/src/scraper/strategies/GitHubRepoProcessor.ts @@ -0,0 +1,179 @@ +import { logger } from "../../utils/logger"; +import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; +import { HttpFetcher } from "../fetcher"; +import { FetchStatus, type RawContent } from "../fetcher/types"; +import { PipelineFactory } from "../pipelines/PipelineFactory"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem } from "../types"; +import { ScrapeMode, type ScraperOptions } from "../types"; +import type { ProcessItemResult } from "./BaseScraperStrategy"; + +export interface GitHubRepoInfo { + owner: string; + repo: string; + branch?: string; + subPath?: string; +} + +export interface GitHubTreeItem { + path: string; + type: "blob" | "tree"; + sha: string; + size?: number; + url: string; +} + +export interface GitHubTreeResponse { + sha: string; + url: string; + tree: GitHubTreeItem[]; + truncated: boolean; +} + +/** + * GitHubRepoProcessor handles processing individual files from GitHub repositories. + * It processes HTTPS blob URLs (https://github.com/owner/repo/blob/branch/filepath). + * + * This processor is stateless and contains the core logic from GitHubRepoScraperStrategy. + */ +export class GitHubRepoProcessor { + private readonly httpFetcher = new HttpFetcher(); + private readonly pipelines: ContentPipeline[]; + + constructor() { + this.pipelines = PipelineFactory.createStandardPipelines(); + } + + /** + * Parses an HTTPS blob URL to extract repository information. + * Format: https://github.com/owner/repo/blob/branch/filepath + */ + parseHttpsBlobUrl(url: string): GitHubRepoInfo & { filePath: string } { + const parsedUrl = new URL(url); + const segments = parsedUrl.pathname.split("/").filter(Boolean); + + // Expected format: /owner/repo/blob/branch/filepath + if (segments.length < 5 || segments[2] !== "blob") { + throw new Error( + `Invalid GitHub blob URL format. Expected: https://github.com/owner/repo/blob/branch/filepath. Got: ${url}`, + ); + } + + const owner = segments[0]; + const repo = segments[1]; + const branch = segments[3]; + const filePath = segments.slice(4).join("/"); + + return { owner, repo, branch, filePath }; + } + + /** + * Fetches the raw content of a file from GitHub. 
+ */ + private async fetchFileContent( + repoInfo: GitHubRepoInfo, + filePath: string, + etag?: string | null, + signal?: AbortSignal, + ): Promise { + const { owner, repo, branch } = repoInfo; + const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`; + + const rawContent = await this.httpFetcher.fetch(rawUrl, { signal, etag }); + + // Override GitHub's generic 'text/plain' MIME type with file extension-based detection + const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath); + if (detectedMimeType && rawContent.mimeType === "text/plain") { + return { + ...rawContent, + mimeType: detectedMimeType, + }; + } + + return rawContent; + } + + /** + * Processes a single GitHub repository file from an HTTPS blob URL. + */ + async process( + item: QueueItem, + options: ScraperOptions, + signal?: AbortSignal, + ): Promise { + // Parse the HTTPS blob URL to extract repository info and file path + const repoInfo = this.parseHttpsBlobUrl(item.url); + const { owner, repo, branch, filePath } = repoInfo; + + // Fetch the file content from raw.githubusercontent.com + const rawContent = await this.fetchFileContent( + { owner, repo, branch }, + filePath, + item.etag, + signal, + ); + + // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND + if (rawContent.status !== FetchStatus.SUCCESS) { + return { url: item.url, links: [], status: rawContent.status }; + } + + // Process content through appropriate pipeline + let processed: PipelineResult | undefined; + + for (const pipeline of this.pipelines) { + const contentBuffer = Buffer.isBuffer(rawContent.content) + ? rawContent.content + : Buffer.from(rawContent.content); + if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) { + logger.debug( + `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`, + ); + + // Force 'fetch' mode for GitHub to avoid unnecessary Playwright usage on raw content. + // GitHub raw files (e.g., HTML files) don't have their dependencies available at the + // raw.githubusercontent.com domain, so rendering them in a browser would be broken + // and provide no additional value over direct HTML parsing with Cheerio. + const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch }; + + processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher); + break; + } + } + + if (!processed) { + logger.warn( + `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`, + ); + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; + } + + for (const err of processed.errors ?? []) { + logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); + } + + // Create document with GitHub-specific metadata + const githubUrl = `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`; + + // Use filename as fallback if title is empty or not a string + const filename = filePath.split("/").pop() || "Untitled"; + + return { + url: githubUrl, + title: processed.title?.trim() || filename || "Untitled", + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, + links: [], // Always return empty links array for individual files + status: FetchStatus.SUCCESS, + }; + } + + /** + * Cleanup resources used by this processor. 
+ */ + async cleanup(): Promise { + await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close())); + } +} diff --git a/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts b/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts deleted file mode 100644 index 14407bb8..00000000 --- a/src/scraper/strategies/GitHubRepoScraperStrategy.test.ts +++ /dev/null @@ -1,437 +0,0 @@ -import { beforeEach, describe, expect, it, vi } from "vitest"; -import { HttpFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; -import { HtmlPipeline } from "../pipelines/HtmlPipeline"; -import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; -import type { ScraperOptions } from "../types"; -import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; - -// Mock the fetcher and pipelines -vi.mock("../fetcher"); -vi.mock("../pipelines/HtmlPipeline"); -vi.mock("../pipelines/MarkdownPipeline"); - -const mockHttpFetcher = vi.mocked(HttpFetcher); -const mockHtmlPipeline = vi.mocked(HtmlPipeline); -const mockMarkdownPipeline = vi.mocked(MarkdownPipeline); - -describe("GitHubRepoScraperStrategy", () => { - let strategy: GitHubRepoScraperStrategy; - let httpFetcherInstance: any; - let htmlPipelineInstance: any; - let markdownPipelineInstance: any; - - beforeEach(() => { - vi.clearAllMocks(); - - // Setup fetcher mock - httpFetcherInstance = { - fetch: vi.fn(), - }; - mockHttpFetcher.mockImplementation(() => httpFetcherInstance); - - // Setup pipeline mocks - htmlPipelineInstance = { - canProcess: vi.fn(), - process: vi.fn(), - close: vi.fn(), - }; - markdownPipelineInstance = { - canProcess: vi.fn(), - process: vi.fn(), - close: vi.fn(), - }; - mockHtmlPipeline.mockImplementation(() => htmlPipelineInstance); - mockMarkdownPipeline.mockImplementation(() => markdownPipelineInstance); - - strategy = new GitHubRepoScraperStrategy(); - }); - - describe("canHandle", () => { - it("should handle GitHub URLs", () => { - expect(strategy.canHandle("https://github.com/owner/repo")).toBe(true); - expect(strategy.canHandle("https://www.github.com/owner/repo")).toBe(true); - expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(true); - expect( - strategy.canHandle("https://github.com/owner/repo/blob/main/README.md"), - ).toBe(true); - }); - - it("should not handle non-GitHub URLs", () => { - expect(strategy.canHandle("https://gitlab.com/owner/repo")).toBe(false); - expect(strategy.canHandle("https://bitbucket.org/owner/repo")).toBe(false); - expect(strategy.canHandle("https://example.com")).toBe(false); - }); - }); - - describe("parseGitHubUrl", () => { - it("should parse basic repository URL", () => { - const result = strategy.parseGitHubUrl("https://github.com/owner/repo"); - expect(result).toEqual({ owner: "owner", repo: "repo" }); - }); - - it("should parse tree URL with branch", () => { - const result = strategy.parseGitHubUrl("https://github.com/owner/repo/tree/main"); - expect(result).toEqual({ owner: "owner", repo: "repo", branch: "main" }); - }); - - it("should parse tree URL with branch and subpath", () => { - const result = strategy.parseGitHubUrl( - "https://github.com/owner/repo/tree/main/docs", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - branch: "main", - subPath: "docs", - }); - }); - - it("should parse blob URL", () => { - const result = strategy.parseGitHubUrl( - "https://github.com/owner/repo/blob/main/README.md", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - branch: "main", - filePath: 
"README.md", - isBlob: true, - }); - }); - - it("should parse blob URL without file path", () => { - const result = strategy.parseGitHubUrl("https://github.com/owner/repo/blob/main"); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - branch: "main", - filePath: undefined, - isBlob: true, - }); - }); - - it("should throw error for invalid repository URL", () => { - expect(() => { - strategy.parseGitHubUrl("https://github.com/invalid"); - }).toThrow("Invalid GitHub repository URL"); - }); - }); - - describe("processItem", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; - - beforeEach(() => { - // Mock repository info response - httpFetcherInstance.fetch.mockImplementation((url: string) => { - if (url.includes("api.github.com/repos")) { - return Promise.resolve({ - content: JSON.stringify({ default_branch: "main" }), - mimeType: "application/json", - source: url, - charset: "utf-8", - }); - } - if (url.includes("git/trees")) { - return Promise.resolve({ - content: JSON.stringify({ - sha: "tree123", - url: "https://api.github.com/repos/owner/repo/git/trees/tree123", - tree: [ - { - path: "README.md", - type: "blob", - sha: "abc123", - size: 1024, - url: "https://api.github.com/repos/owner/repo/git/blobs/abc123", - }, - { - path: "src/index.js", - type: "blob", - sha: "def456", - size: 512, - url: "https://api.github.com/repos/owner/repo/git/blobs/def456", - }, - { - path: "binary-file.png", - type: "blob", - sha: "ghi789", - size: 2048, - url: "https://api.github.com/repos/owner/repo/git/blobs/ghi789", - }, - ], - truncated: false, - }), - mimeType: "application/json", - source: url, - charset: "utf-8", - }); - } - return Promise.resolve({ - content: "file content", - mimeType: "text/plain", - source: url, - charset: "utf-8", - }); - }); - }); - - it("should discover repository structure and return file links", async () => { - const item = { url: "https://github.com/owner/repo", depth: 0 }; - - // Mock the fetchRepositoryTree method directly since it's a complex interaction - const mockFetchRepositoryTree = vi - .spyOn(strategy as any, "fetchRepositoryTree") - .mockResolvedValue({ - tree: { - sha: "tree123", - url: "https://api.github.com/repos/owner/repo/git/trees/tree123", - tree: [ - { - path: "README.md", - type: "blob", - sha: "abc123", - size: 1024, - url: "https://api.github.com/repos/owner/repo/git/blobs/abc123", - }, - { - path: "src/index.js", - type: "blob", - sha: "def456", - size: 512, - url: "https://api.github.com/repos/owner/repo/git/blobs/def456", - }, - { - path: "binary-file.png", - type: "blob", - sha: "ghi789", - size: 2048, - url: "https://api.github.com/repos/owner/repo/git/blobs/ghi789", - }, - ], - truncated: false, - }, - resolvedBranch: "main", - }); - - const result = await (strategy as any).processItem(item, options); - - expect(result.links).toEqual([ - "github-file://README.md", - "github-file://src/index.js", - ]); - expect(result.document).toBeUndefined(); - - // Clean up the spy - mockFetchRepositoryTree.mockRestore(); - }); - - it("should handle blob URL with file path", async () => { - const blobOptions = { - ...options, - url: "https://github.com/owner/repo/blob/main/README.md", - }; - const item = { url: "https://github.com/owner/repo/blob/main/README.md", depth: 0 }; - const result = await (strategy as any).processItem(item, blobOptions); - - expect(result.links).toEqual(["github-file://README.md"]); - expect(result.document).toBeUndefined(); - }); - - 
it("should handle blob URL without file path", async () => { - const blobOptions = { - ...options, - url: "https://github.com/owner/repo/blob/main", - }; - const item = { url: "https://github.com/owner/repo/blob/main", depth: 0 }; - const result = await (strategy as any).processItem(item, blobOptions); - - expect(result.links).toEqual([]); - expect(result.document).toBeUndefined(); - }); - - it("should process individual file content", async () => { - const rawContent: RawContent = { - content: "# Test File\nThis is a test markdown file.", - mimeType: "text/markdown", - source: "https://raw.githubusercontent.com/owner/repo/main/README.md", - charset: "utf-8", - }; - - const processedContent = { - textContent: "Test File\nThis is a test markdown file.", - metadata: { title: "Test File" }, - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - markdownPipelineInstance.canProcess.mockReturnValue(true); - markdownPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "github-file://README.md", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document).toEqual({ - content: "Test File\nThis is a test markdown file.", - contentType: "text/markdown", - metadata: { - url: "https://github.com/owner/repo/blob/main/README.md", - title: "Test File", - library: "test-lib", - version: "1.0.0", - }, - }); - expect(result.links).toEqual([]); - }); - - it("should use filename as title fallback when no title found", async () => { - const rawContent: RawContent = { - content: "Some content without title", - mimeType: "text/plain", - source: "https://raw.githubusercontent.com/owner/repo/main/config.txt", - charset: "utf-8", - }; - - const processedContent = { - textContent: "Some content without title", - metadata: { title: "" }, - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - markdownPipelineInstance.canProcess.mockReturnValue(true); - markdownPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "github-file://config.txt", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document?.metadata.title).toBe("config.txt"); - }); - - it("should handle unsupported content types", async () => { - const rawContent: RawContent = { - content: "binary content", - mimeType: "application/octet-stream", - source: "https://raw.githubusercontent.com/owner/repo/main/binary.bin", - charset: "utf-8", - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(false); - markdownPipelineInstance.canProcess.mockReturnValue(false); - - const item = { url: "github-file://binary.bin", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document).toBeUndefined(); - expect(result.links).toEqual([]); - }); - }); - - describe("shouldProcessFile", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; - - it("should process text files", () => { - const textFiles = [ - { path: "README.md", type: "blob" as const }, - { path: "src/index.js", type: "blob" as const }, - { path: "docs/guide.rst", type: "blob" as const }, - { path: "package.json", type: "blob" as const }, - { path: "Dockerfile", type: "blob" as const }, - // Note: LICENSE files are excluded by default patterns, so we don't test it here - ]; - - for (const 
file of textFiles) { - expect((strategy as any).shouldProcessFile(file, options)).toBe(true); - } - }); - - it("should skip binary files", () => { - const binaryFiles = [ - { path: "image.png", type: "blob" as const }, - { path: "video.mp4", type: "blob" as const }, - { path: "archive.zip", type: "blob" as const }, - { path: "binary.exe", type: "blob" as const }, - ]; - - for (const file of binaryFiles) { - expect((strategy as any).shouldProcessFile(file, options)).toBe(false); - } - }); - - it("should skip tree items", () => { - const treeItem = { path: "src", type: "tree" as const }; - expect((strategy as any).shouldProcessFile(treeItem, options)).toBe(false); - }); - - it("should respect include patterns", () => { - const optionsWithInclude = { - ...options, - includePatterns: ["*.md", "src/*"], - }; - - expect( - (strategy as any).shouldProcessFile( - { path: "README.md", type: "blob" as const }, - optionsWithInclude, - ), - ).toBe(true); - expect( - (strategy as any).shouldProcessFile( - { path: "src/index.js", type: "blob" as const }, - optionsWithInclude, - ), - ).toBe(true); - expect( - (strategy as any).shouldProcessFile( - { path: "package.json", type: "blob" as const }, - optionsWithInclude, - ), - ).toBe(false); - }); - - it("should respect exclude patterns", () => { - const optionsWithExclude = { - ...options, - excludePatterns: ["**/*.test.js", "node_modules/**"], - }; - - expect( - (strategy as any).shouldProcessFile( - { path: "src/index.js", type: "blob" as const }, - optionsWithExclude, - ), - ).toBe(true); - expect( - (strategy as any).shouldProcessFile( - { path: "src/index.test.js", type: "blob" as const }, - optionsWithExclude, - ), - ).toBe(false); - expect( - (strategy as any).shouldProcessFile( - { path: "node_modules/package/index.js", type: "blob" as const }, - optionsWithExclude, - ), - ).toBe(false); - }); - }); - - describe("cleanup", () => { - it("should cleanup pipeline resources", async () => { - await strategy.cleanup(); - expect(htmlPipelineInstance.close).toHaveBeenCalled(); - expect(markdownPipelineInstance.close).toHaveBeenCalled(); - }); - }); -}); diff --git a/src/scraper/strategies/GitHubRepoScraperStrategy.ts b/src/scraper/strategies/GitHubRepoScraperStrategy.ts deleted file mode 100644 index 1e52a0ad..00000000 --- a/src/scraper/strategies/GitHubRepoScraperStrategy.ts +++ /dev/null @@ -1,529 +0,0 @@ -import type { Document, ProgressCallback } from "../../types"; -import { logger } from "../../utils/logger"; -import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; -import { HttpFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; -import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline } from "../pipelines/types"; -import { ScrapeMode, type ScraperOptions, type ScraperProgress } from "../types"; -import { shouldIncludeUrl } from "../utils/patternMatcher"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; - -interface GitHubRepoInfo { - owner: string; - repo: string; - branch?: string; - subPath?: string; -} - -interface GitHubTreeItem { - path: string; - type: "blob" | "tree"; - sha: string; - size?: number; - url: string; -} - -interface GitHubTreeResponse { - sha: string; - url: string; - tree: GitHubTreeItem[]; - truncated: boolean; -} - -/** - * GitHubRepoScraperStrategy handles native repository crawling by accessing GitHub's tree API - * to discover repository structure and fetching raw file contents. 
This treats repositories - * more like file systems rather than web pages. - * - * Features: - * - Uses GitHub tree API for efficient repository structure discovery - * - Fetches raw file contents from raw.githubusercontent.com - * - Processes all text files (source code, markdown, documentation, etc.) - * - Supports branch-specific crawling (defaults to main/default branch) - * - Automatically detects repository default branch when no branch specified - * - Respects repository subpath URLs (e.g., /tree//docs) by limiting indexed files - * - Filters out binary files and processes only text-based content - * - * Note: Wiki pages are not currently supported in this native mode. For wiki access, - * consider using the web scraping approach or a separate scraping job. - */ -export class GitHubRepoScraperStrategy extends BaseScraperStrategy { - private readonly httpFetcher = new HttpFetcher(); - private readonly pipelines: ContentPipeline[]; - private resolvedBranch?: string; // Cache the resolved default branch - - constructor() { - super(); - this.pipelines = PipelineFactory.createStandardPipelines(); - } - - canHandle(url: string): boolean { - const { hostname } = new URL(url); - return ["github.com", "www.github.com"].includes(hostname); - } - - /** - * Override shouldProcessUrl to handle github-file:// URLs specially. - * These URLs bypass scope checking since they're internal file references. - */ - protected shouldProcessUrl(url: string, options: ScraperOptions): boolean { - // For github-file:// URLs, only apply include/exclude patterns, skip scope checking - if (url.startsWith("github-file://")) { - const filePath = url.replace("github-file://", ""); - return shouldIncludeUrl(filePath, options.includePatterns, options.excludePatterns); - } - - // For regular URLs, use the base implementation - return super.shouldProcessUrl(url, options); - } - - /** - * Parses a GitHub URL to extract repository information. - */ - parseGitHubUrl(url: string): GitHubRepoInfo & { isBlob?: boolean; filePath?: string } { - const parsedUrl = new URL(url); - // Extract // from github.com///... - const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)/); - if (!match) { - throw new Error(`Invalid GitHub repository URL: ${url}`); - } - - const [, owner, repo] = match; - - // Extract branch and optional subpath from URLs like /tree// - const segments = parsedUrl.pathname.split("/").filter(Boolean); - - // Handle /blob/ URLs for single file indexing - if (segments.length >= 4 && segments[2] === "blob") { - const branch = segments[3]; - const filePath = segments.length > 4 ? segments.slice(4).join("/") : undefined; - return { owner, repo, branch, filePath, isBlob: true }; - } - - // Only handle URLs of the form /owner/repo/tree/branch/subPath - if (segments.length < 4 || segments[2] !== "tree") { - // Unsupported format (missing branch, or not a tree/blob URL) - return { owner, repo }; - } - - const branch = segments[3]; - const subPath = segments.length > 4 ? segments.slice(4).join("/") : undefined; - - return { owner, repo, branch, subPath }; - } - - /** - * Fetches the repository tree structure from GitHub API. - * Uses 'HEAD' to get the default branch if no branch is specified. 
- */ - async fetchRepositoryTree( - repoInfo: GitHubRepoInfo, - signal?: AbortSignal, - ): Promise<{ tree: GitHubTreeResponse; resolvedBranch: string }> { - const { owner, repo, branch } = repoInfo; - - // If no branch specified, fetch the default branch first - let targetBranch = branch; - if (!targetBranch) { - try { - // Get repository information to find the default branch - const repoUrl = `https://api.github.com/repos/${owner}/${repo}`; - logger.debug(`Fetching repository info: ${repoUrl}`); - - const repoContent = await this.httpFetcher.fetch(repoUrl, { signal }); - const content = - typeof repoContent.content === "string" - ? repoContent.content - : repoContent.content.toString("utf-8"); - const repoData = JSON.parse(content) as { default_branch: string }; - targetBranch = repoData.default_branch; - - logger.debug(`Using default branch: ${targetBranch}`); - } catch (error) { - logger.warn(`⚠️ Could not fetch default branch, using 'main': ${error}`); - targetBranch = "main"; - } - } - - // Cache the resolved branch for file fetching - this.resolvedBranch = targetBranch; - - const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`; - - logger.debug(`Fetching repository tree: ${treeUrl}`); - - const rawContent = await this.httpFetcher.fetch(treeUrl, { signal }); - const content = - typeof rawContent.content === "string" - ? rawContent.content - : rawContent.content.toString("utf-8"); - const treeData = JSON.parse(content) as GitHubTreeResponse; - - if (treeData.truncated) { - logger.warn( - `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`, - ); - } - - return { tree: treeData, resolvedBranch: targetBranch }; - } - - /** - * Determines if a file should be processed based on its path and type. - */ - private shouldProcessFile(item: GitHubTreeItem, options: ScraperOptions): boolean { - // Only process blob (file) items, not trees (directories) - if (item.type !== "blob") { - return false; - } - - const path = item.path; - - // Whitelist of text-based file extensions that we can process - const textExtensions = [ - // Documentation - ".md", - ".mdx", - ".txt", - ".rst", - ".adoc", - ".asciidoc", - - // Web technologies - ".html", - ".htm", - ".xml", - ".css", - ".scss", - ".sass", - ".less", - - // Programming languages - ".js", - ".jsx", - ".ts", - ".tsx", - ".py", - ".java", - ".c", - ".cpp", - ".cc", - ".cxx", - ".h", - ".hpp", - ".cs", - ".go", - ".rs", - ".rb", - ".php", - ".swift", - ".kt", - ".scala", - ".clj", - ".cljs", - ".hs", - ".elm", - ".dart", - ".r", - ".m", - ".mm", - ".sh", - ".bash", - ".zsh", - ".fish", - ".ps1", - ".bat", - ".cmd", - - // Configuration and data - ".json", - ".yaml", - ".yml", - ".toml", - ".ini", - ".cfg", - ".conf", - ".properties", - ".env", - ".gitignore", - ".dockerignore", - ".gitattributes", - ".editorconfig", - - // Build and package management - ".gradle", - ".pom", - ".sbt", - ".maven", - ".cmake", - ".make", - ".dockerfile", - ".mod", // Go modules (go.mod) - ".sum", // Go checksums (go.sum) - - // Other text formats - ".sql", - ".graphql", - ".gql", - ".proto", - ".thrift", - ".avro", - ".csv", - ".tsv", - ".log", - ]; - - const pathLower = path.toLowerCase(); - - // Check for known text extensions - const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext)); - - // Check for compound extensions and special cases - const hasCompoundExtension = - pathLower.includes(".env.") || // .env.example, .env.local, etc. 
- pathLower.endsWith(".env") || - pathLower.includes(".config.") || // webpack.config.js, etc. - pathLower.includes(".lock"); // package-lock.json, etc. - - // Also include files without extensions that are commonly text files - const fileName = path.split("/").pop() || ""; - const fileNameLower = fileName.toLowerCase(); - const commonTextFiles = [ - // Documentation files without extensions - "readme", - "license", - "changelog", - "contributing", - "authors", - "maintainers", - - // Build files without extensions - "dockerfile", - "makefile", - "rakefile", - "gemfile", - "podfile", - "cartfile", - "brewfile", - "procfile", - "vagrantfile", - "gulpfile", - "gruntfile", - - // Configuration files (dotfiles) - ".prettierrc", - ".eslintrc", - ".babelrc", - ".nvmrc", - ".npmrc", - ]; - - const isCommonTextFile = commonTextFiles.some((name) => { - if (name.startsWith(".")) { - // For dotfiles, match exactly or with additional extension (e.g., .prettierrc.js) - return fileNameLower === name || fileNameLower.startsWith(`${name}.`); - } - // For regular files, match exactly or with extension - return fileNameLower === name || fileNameLower.startsWith(`${name}.`); - }); - - // Process file if it has a text extension, compound extension, or is a common text file - if (!hasTextExtension && !hasCompoundExtension && !isCommonTextFile) { - return false; - } - - // Apply user-defined include/exclude patterns (use the file path directly) - return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns); - } - - /** - * Fetches the raw content of a file from GitHub. - */ - async fetchFileContent( - repoInfo: GitHubRepoInfo, - filePath: string, - signal?: AbortSignal, - ): Promise { - const { owner, repo } = repoInfo; - // Use resolved branch if available, otherwise use provided branch or default to main - const branch = this.resolvedBranch || repoInfo.branch || "main"; - const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${filePath}`; - - const rawContent = await this.httpFetcher.fetch(rawUrl, { signal }); - - // Override GitHub's generic 'text/plain' MIME type with file extension-based detection - const detectedMimeType = MimeTypeUtils.detectMimeTypeFromPath(filePath); - if (detectedMimeType && rawContent.mimeType === "text/plain") { - return { - ...rawContent, - mimeType: detectedMimeType, - }; - } - - return rawContent; - } - - protected async processItem( - item: QueueItem, - options: ScraperOptions, - _progressCallback?: ProgressCallback, - signal?: AbortSignal, - ): Promise<{ document?: Document; links?: string[] }> { - // Parse the URL to get repository information - const repoInfo = this.parseGitHubUrl(options.url); - - // For the initial item, handle blob URLs differently than tree URLs - if (item.depth === 0) { - // Handle single file (blob) URLs - if ("isBlob" in repoInfo && repoInfo.isBlob) { - if (repoInfo.filePath) { - logger.info( - `📄 Processing single file: ${repoInfo.owner}/${repoInfo.repo}/${repoInfo.filePath}`, - ); - - // Process the single file directly - return { links: [`github-file://${repoInfo.filePath}`] }; - } else { - // Blob URL without file path - return empty links - logger.warn( - `⚠️ Blob URL without file path: ${options.url}. 
No files to process.`, - ); - return { links: [] }; - } - } - - // Handle repository tree crawling (existing logic) - logger.info( - `🗂️ Discovering repository structure for ${repoInfo.owner}/${repoInfo.repo}`, - ); - - const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal); - const fileItems = tree.tree - .filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)) - .filter((treeItem) => this.shouldProcessFile(treeItem, options)); - - logger.info( - `📁 Found ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`, - ); - - // Convert tree items to URLs for the queue - const links = fileItems.map((treeItem) => `github-file://${treeItem.path}`); - - return { links }; - } - - // Process individual files - if (item.url.startsWith("github-file://")) { - const filePath = item.url.replace("github-file://", ""); - - logger.info( - `🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`, - ); - - const rawContent = await this.fetchFileContent(repoInfo, filePath, signal); - - // Process content through appropriate pipeline - let processed: Awaited> | undefined; - - for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { - logger.debug( - `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`, - ); - - // Force 'fetch' mode for GitHub to avoid unnecessary Playwright usage on raw content. - // GitHub raw files (e.g., HTML files) don't have their dependencies available at the - // raw.githubusercontent.com domain, so rendering them in a browser would be broken - // and provide no additional value over direct HTML parsing with Cheerio. - const gitHubOptions = { ...options, scrapeMode: ScrapeMode.Fetch }; - - processed = await pipeline.process(rawContent, gitHubOptions, this.httpFetcher); - break; - } - } - - if (!processed) { - logger.warn( - `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`, - ); - return { document: undefined, links: [] }; - } - - for (const err of processed.errors) { - logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); - } - - // Create document with GitHub-specific metadata - const githubUrl = `https://github.com/${repoInfo.owner}/${repoInfo.repo}/blob/${this.resolvedBranch || repoInfo.branch || "main"}/${filePath}`; - - // Use filename as fallback if title is empty or not a string - const processedTitle = processed.metadata.title; - const hasValidTitle = - typeof processedTitle === "string" && processedTitle.trim() !== ""; - const fallbackTitle = filePath.split("/").pop() || "Untitled"; - - return { - document: { - content: typeof processed.textContent === "string" ? processed.textContent : "", - metadata: { - url: githubUrl, - title: hasValidTitle ? processedTitle : fallbackTitle, - library: options.library, - version: options.version, - }, - contentType: rawContent.mimeType, // Preserve the detected MIME type - } satisfies Document, - links: [], // Always return empty links array for individual files - }; - } - - return { document: undefined, links: [] }; - } - - /** - * Normalize a path by removing leading and trailing slashes. 
- */ - private normalizePath(path: string): string { - return path.replace(/^\/+/, "").replace(/\/+$/, ""); - } - - private isWithinSubPath(path: string, subPath?: string): boolean { - if (!subPath) { - return true; - } - - const trimmedSubPath = this.normalizePath(subPath); - if (trimmedSubPath.length === 0) { - return true; - } - - const normalizedPath = this.normalizePath(path); - if (normalizedPath === trimmedSubPath) { - return true; - } - - return normalizedPath.startsWith(`${trimmedSubPath}/`); - } - - async scrape( - options: ScraperOptions, - progressCallback: ProgressCallback, - signal?: AbortSignal, - ): Promise { - // Validate it's a GitHub URL - const url = new URL(options.url); - if (!url.hostname.includes("github.com")) { - throw new Error("URL must be a GitHub URL"); - } - - return super.scrape(options, progressCallback, signal); - } - - /** - * Cleanup resources used by this strategy, specifically the pipeline browser instances. - */ - async cleanup(): Promise { - await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close())); - } -} diff --git a/src/scraper/strategies/GitHubScraperStrategy.test.ts b/src/scraper/strategies/GitHubScraperStrategy.test.ts index 24bf66ff..5a283239 100644 --- a/src/scraper/strategies/GitHubScraperStrategy.test.ts +++ b/src/scraper/strategies/GitHubScraperStrategy.test.ts @@ -1,38 +1,25 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; +import { FetchStatus, HttpFetcher } from "../fetcher"; +import type { ScraperOptions } from "../types"; import { GitHubScraperStrategy } from "./GitHubScraperStrategy"; -import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; -// Mock the underlying strategies -vi.mock("./GitHubRepoScraperStrategy"); -vi.mock("./GitHubWikiScraperStrategy"); +// Mock the dependencies +vi.mock("../fetcher"); -const mockRepoStrategy = vi.mocked(GitHubRepoScraperStrategy); -const mockWikiStrategy = vi.mocked(GitHubWikiScraperStrategy); +const mockHttpFetcher = vi.mocked(HttpFetcher); describe("GitHubScraperStrategy", () => { let strategy: GitHubScraperStrategy; - let repoStrategyInstance: any; - let wikiStrategyInstance: any; + let httpFetcherInstance: any; beforeEach(() => { vi.clearAllMocks(); - // Setup repo strategy mock - repoStrategyInstance = { - canHandle: vi.fn(), - scrape: vi.fn(), - cleanup: vi.fn(), + // Setup fetcher mock + httpFetcherInstance = { + fetch: vi.fn(), }; - mockRepoStrategy.mockImplementation(() => repoStrategyInstance); - - // Setup wiki strategy mock - wikiStrategyInstance = { - canHandle: vi.fn(), - scrape: vi.fn(), - cleanup: vi.fn(), - }; - mockWikiStrategy.mockImplementation(() => wikiStrategyInstance); + mockHttpFetcher.mockImplementation(() => httpFetcherInstance); strategy = new GitHubScraperStrategy(); }); @@ -40,18 +27,24 @@ describe("GitHubScraperStrategy", () => { describe("canHandle", () => { it("should handle base GitHub repository URLs", () => { expect(strategy.canHandle("https://github.com/owner/repo")).toBe(true); - expect(strategy.canHandle("https://github.com/owner/repo/")).toBe(true); expect(strategy.canHandle("https://www.github.com/owner/repo")).toBe(true); + expect(strategy.canHandle("https://github.com/owner/repo/")).toBe(true); }); - it("should not handle GitHub URLs with specific paths", () => { - expect(strategy.canHandle("https://github.com/owner/repo/wiki")).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/wiki/Home")).toBe(false); - 
expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(false); + it("should handle tree URLs with branch", () => { + expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(true); + expect(strategy.canHandle("https://github.com/owner/repo/tree/develop/src")).toBe( + true, + ); + }); + + it("should handle blob URLs with file paths", () => { expect( strategy.canHandle("https://github.com/owner/repo/blob/main/README.md"), - ).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/issues")).toBe(false); + ).toBe(true); + expect( + strategy.canHandle("https://github.com/owner/repo/blob/main/src/index.js"), + ).toBe(true); }); it("should not handle non-GitHub URLs", () => { @@ -60,105 +53,350 @@ describe("GitHubScraperStrategy", () => { expect(strategy.canHandle("https://example.com")).toBe(false); }); - it("should not handle invalid URLs", () => { - expect(strategy.canHandle("invalid-url")).toBe(false); - expect(strategy.canHandle("")).toBe(false); + it("should not handle GitHub wiki URLs", () => { + expect(strategy.canHandle("https://github.com/owner/repo/wiki")).toBe(false); + expect(strategy.canHandle("https://github.com/owner/repo/wiki/Page")).toBe(false); }); - }); - // Note: shouldProcessUrl is a protected method that delegates to underlying strategies, - // but it's mainly used internally. The most important behavior is tested via the scrape() method. - - describe("scrape", () => { - it("should orchestrate both repo and wiki scraping", async () => { - const options = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; + it("should not handle other GitHub paths", () => { + expect(strategy.canHandle("https://github.com/owner/repo/issues")).toBe(false); + expect(strategy.canHandle("https://github.com/owner/repo/pulls")).toBe(false); + }); + }); - const progressCallback = vi.fn(); + describe("parseGitHubUrl", () => { + it("should parse basic repository URL", () => { + const result = (strategy as any).parseGitHubUrl("https://github.com/owner/repo"); + expect(result).toEqual({ owner: "owner", repo: "repo" }); + }); - repoStrategyInstance.scrape.mockResolvedValue(undefined); - wikiStrategyInstance.scrape.mockResolvedValue(undefined); + it("should parse tree URL with branch", () => { + const result = (strategy as any).parseGitHubUrl( + "https://github.com/owner/repo/tree/main", + ); + expect(result).toEqual({ owner: "owner", repo: "repo", branch: "main" }); + }); - await strategy.scrape(options, progressCallback); + it("should parse tree URL with branch and subpath", () => { + const result = (strategy as any).parseGitHubUrl( + "https://github.com/owner/repo/tree/main/docs", + ); + expect(result).toEqual({ + owner: "owner", + repo: "repo", + branch: "main", + subPath: "docs", + }); + }); - // Should scrape wiki first (prioritized) - expect(wikiStrategyInstance.scrape).toHaveBeenCalledWith( - expect.objectContaining({ - ...options, - url: "https://github.com/owner/repo/wiki", - }), - expect.any(Function), - undefined, + it("should parse blob URL with file", () => { + const result = (strategy as any).parseGitHubUrl( + "https://github.com/owner/repo/blob/main/README.md", ); + expect(result).toEqual({ + owner: "owner", + repo: "repo", + branch: "main", + filePath: "README.md", + isBlob: true, + }); + }); - // Should then scrape repository with adjusted maxPages - expect(repoStrategyInstance.scrape).toHaveBeenCalledWith( - expect.objectContaining({ - ...options, - maxPages: 1000, // Default maxPages since no 
wiki pages were scraped in mock - }), - expect.any(Function), - undefined, + it("should parse blob URL with nested file path", () => { + const result = (strategy as any).parseGitHubUrl( + "https://github.com/owner/repo/blob/main/src/index.js", ); + expect(result).toEqual({ + owner: "owner", + repo: "repo", + branch: "main", + filePath: "src/index.js", + isBlob: true, + }); }); - it("should handle wiki scraping failure gracefully", async () => { - const options = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; + it("should throw error for invalid repository URL", () => { + expect(() => { + (strategy as any).parseGitHubUrl("https://github.com/invalid"); + }).toThrow("Invalid GitHub repository URL"); + }); + }); + + describe("shouldProcessFile", () => { + const options: ScraperOptions = { + url: "https://github.com/owner/repo", + library: "test-lib", + version: "1.0.0", + }; - const progressCallback = vi.fn(); + it("should process text files with common extensions", () => { + const textFiles = [ + { path: "README.md", type: "blob" as const }, + { path: "src/index.js", type: "blob" as const }, + { path: "docs/guide.rst", type: "blob" as const }, + { path: "package.json", type: "blob" as const }, + { path: "config.yaml", type: "blob" as const }, + { path: "script.py", type: "blob" as const }, + ]; - repoStrategyInstance.scrape.mockResolvedValue(undefined); - wikiStrategyInstance.scrape.mockRejectedValue(new Error("Wiki not found")); + for (const file of textFiles) { + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(true); + } + }); - // Should not throw error when wiki fails - await expect(strategy.scrape(options, progressCallback)).resolves.toBeUndefined(); + it("should process common text files without extensions", () => { + const commonFiles = [ + { path: "Dockerfile", type: "blob" as const }, + { path: "Makefile", type: "blob" as const }, + { path: "README", type: "blob" as const }, + { path: "CHANGELOG", type: "blob" as const }, + ]; - expect(repoStrategyInstance.scrape).toHaveBeenCalled(); - expect(wikiStrategyInstance.scrape).toHaveBeenCalled(); + for (const file of commonFiles) { + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(true); + } }); - it("should validate GitHub URLs", async () => { - const options = { - url: "https://example.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; + it("should process config files", () => { + const configFiles = [ + { path: ".prettierrc", type: "blob" as const }, + { path: ".eslintrc", type: "blob" as const }, + { path: ".babelrc", type: "blob" as const }, + { path: ".env", type: "blob" as const }, + { path: ".env.local", type: "blob" as const }, + ]; - const progressCallback = vi.fn(); + for (const file of configFiles) { + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(true); + } + }); - await expect(strategy.scrape(options, progressCallback)).rejects.toThrow( - "URL must be a GitHub URL", - ); + it("should skip binary files", () => { + const binaryFiles = [ + { path: "image.png", type: "blob" as const }, + { path: "video.mp4", type: "blob" as const }, + { path: "archive.zip", type: "blob" as const }, + { path: "binary.exe", type: "blob" as const }, + { path: "lib.so", type: "blob" as const }, + { path: "app.dmg", type: "blob" as const }, + ]; + + for (const file of binaryFiles) { + // 
@ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(file, options)).toBe(false); + } + }); + + it("should skip tree items (directories)", () => { + const treeItem = { path: "src", type: "tree" as const }; + // @ts-expect-error Accessing private method for testing + expect(strategy.shouldProcessFile(treeItem, options)).toBe(false); }); - it("should validate repository URL format", async () => { - const options = { - url: "https://github.com/owner/repo/tree/main", - library: "test-lib", - version: "1.0.0", + it("should respect include patterns", () => { + const optionsWithInclude = { + ...options, + includePatterns: ["*.md", "src/**"], }; - const progressCallback = vi.fn(); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "README.md", type: "blob" as const, sha: "abc", url: "" }, + optionsWithInclude, + ), + ).toBe(true); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "src/index.js", type: "blob" as const, sha: "def", url: "" }, + optionsWithInclude, + ), + ).toBe(true); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "package.json", type: "blob" as const, sha: "ghi", url: "" }, + optionsWithInclude, + ), + ).toBe(false); + }); - await expect(strategy.scrape(options, progressCallback)).rejects.toThrow( - "URL must be a base GitHub repository URL", - ); + it("should respect exclude patterns", () => { + const optionsWithExclude = { + ...options, + excludePatterns: ["**/*.test.js", "node_modules/**"], + }; + + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "src/index.js", type: "blob" as const, sha: "abc", url: "" }, + optionsWithExclude, + ), + ).toBe(true); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { path: "src/index.test.js", type: "blob" as const, sha: "def", url: "" }, + optionsWithExclude, + ), + ).toBe(false); + expect( + // @ts-expect-error Accessing private method for testing + strategy.shouldProcessFile( + { + path: "node_modules/package/index.js", + type: "blob" as const, + sha: "ghi", + url: "", + }, + optionsWithExclude, + ), + ).toBe(false); + }); + }); + + describe("isWithinSubPath", () => { + it("should return true when no subPath is specified", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("any/path", undefined)).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("any/path", "")).toBe(true); + }); + + it("should return true for exact subPath match", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs", "docs")).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("src/lib", "src/lib")).toBe(true); + }); + + it("should return true for paths within subPath", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs/guide.md", "docs")).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("src/lib/index.js", "src/lib")).toBe(true); + }); + + it("should return false for paths outside subPath", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("README.md", "docs")).toBe(false); + // 
@ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("src/index.js", "docs")).toBe(false); + }); + + it("should handle trailing slashes correctly", () => { + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs/guide.md", "docs/")).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs/guide.md", "/docs")).toBe(true); + // @ts-expect-error Accessing private method for testing + expect(strategy.isWithinSubPath("docs/guide.md", "/docs/")).toBe(true); }); }); - describe("cleanup", () => { - it("should cleanup both underlying strategies", async () => { - await strategy.cleanup(); + describe("processItem", () => { + const options: ScraperOptions = { + url: "https://github.com/owner/repo", + library: "test-lib", + version: "1.0.0", + }; + + beforeEach(() => { + // Mock default branch fetch + httpFetcherInstance.fetch.mockImplementation((url: string) => { + if (url.includes("api.github.com/repos/") && !url.includes("/git/trees/")) { + return Promise.resolve({ + content: JSON.stringify({ default_branch: "main" }), + mimeType: "application/json", + source: url, + charset: "utf-8", + status: FetchStatus.SUCCESS, + }); + } + if (url.includes("/git/trees/")) { + return Promise.resolve({ + content: JSON.stringify({ + sha: "tree123", + url: "https://api.github.com/repos/owner/repo/git/trees/tree123", + tree: [ + { + path: "README.md", + type: "blob", + sha: "abc123", + size: 1024, + url: "...", + }, + { + path: "src/index.js", + type: "blob", + sha: "def456", + size: 512, + url: "...", + }, + { + path: "image.png", + type: "blob", + sha: "ghi789", + size: 2048, + url: "...", + }, + ], + truncated: false, + }), + mimeType: "application/json", + source: url, + charset: "utf-8", + status: FetchStatus.SUCCESS, + }); + } + return Promise.resolve({ + content: "file content", + mimeType: "text/plain", + source: url, + charset: "utf-8", + status: FetchStatus.SUCCESS, + }); + }); + }); + + it("should discover files and return HTTPS blob URLs", async () => { + const item = { url: "https://github.com/owner/repo", depth: 0 }; + const result = await strategy.processItem(item, options); + + expect(result.status).toBe(FetchStatus.SUCCESS); + expect(result.links).toContain("https://github.com/owner/repo/blob/main/README.md"); + expect(result.links).toContain( + "https://github.com/owner/repo/blob/main/src/index.js", + ); + expect(result.links).not.toContain( + "https://github.com/owner/repo/blob/main/image.png", + ); + }); + + it("should return empty links for non-depth-0 items", async () => { + const item = { url: "https://github.com/owner/repo", depth: 1 }; + const result = await strategy.processItem(item, options); + + expect(result.status).toBe(FetchStatus.SUCCESS); + expect(result.links).toEqual([]); + }); + + it("should handle single blob file URLs with strict scoping", async () => { + const blobOptions = { + ...options, + url: "https://github.com/owner/repo/blob/main/README.md", + }; + const item = { url: "https://github.com/owner/repo/blob/main/README.md", depth: 0 }; + const result = await strategy.processItem(item, blobOptions); - expect(repoStrategyInstance.cleanup).toHaveBeenCalled(); - expect(wikiStrategyInstance.cleanup).toHaveBeenCalled(); + expect(result.status).toBe(FetchStatus.SUCCESS); + // Strict scoping: blob URL should index ONLY that file, not discover wiki + expect(result.links).toEqual(["https://github.com/owner/repo/blob/main/README.md"]); }); }); }); diff 
--git a/src/scraper/strategies/GitHubScraperStrategy.ts b/src/scraper/strategies/GitHubScraperStrategy.ts index 55df19aa..72589da5 100644 --- a/src/scraper/strategies/GitHubScraperStrategy.ts +++ b/src/scraper/strategies/GitHubScraperStrategy.ts @@ -1,15 +1,27 @@ +import mime from "mime"; import type { ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; -import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; -import { GitHubRepoScraperStrategy } from "./GitHubRepoScraperStrategy"; -import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; +import { HttpFetcher } from "../fetcher"; +import { FetchStatus } from "../fetcher/types"; +import type { QueueItem, ScraperOptions, ScraperProgressEvent } from "../types"; +import { shouldIncludeUrl } from "../utils/patternMatcher"; +import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; +import type { + GitHubRepoInfo, + GitHubTreeItem, + GitHubTreeResponse, +} from "./GitHubRepoProcessor"; +import { GitHubRepoProcessor } from "./GitHubRepoProcessor"; +import { GitHubWikiProcessor } from "./GitHubWikiProcessor"; /** - * GitHubScraperStrategy is a composite strategy that orchestrates the scraping of both + * GitHubScraperStrategy is a discovery strategy that orchestrates the scraping of both * GitHub repository code and wiki pages. When given a GitHub repository URL, it will: * - * 1. Attempt to scrape the repository's wiki pages using GitHubWikiScraperStrategy (prioritized) - * 2. Scrape the repository's code files using GitHubRepoScraperStrategy (with remaining page budget) + * 1. Attempt to scrape the repository's wiki pages using GitHubWikiProcessor (prioritized) + * 2. Discover all repository files using the GitHub Tree API + * 3. Create HTTPS blob URLs for each file, which are stored in the database + * 4. 
Process blob URLs directly with GitHubRepoProcessor * * This provides comprehensive documentation coverage by including both wiki documentation * and source code in a single scraping job, with wikis prioritized as they typically @@ -17,131 +29,425 @@ import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; * * Features: * - Handles base GitHub repository URLs (e.g., https://github.com/owner/repo) + * - Handles branch-specific URLs (e.g., https://github.com/owner/repo/tree/branch) + * - Handles single file URLs (e.g., https://github.com/owner/repo/blob/branch/path) + * - Discovers all files efficiently using GitHub's Tree API + * - Generates and processes user-friendly HTTPS blob URLs throughout * - Prioritizes wiki content over repository files for better documentation quality * - Respects maxPages limit across both scraping phases to prevent exceeding quotas * - Automatically discovers and scrapes both wiki and code content - * - Merges progress reporting from both sub-strategies * - Graceful handling when wikis don't exist or are inaccessible - * - Maintains all the capabilities of both underlying strategies */ -export class GitHubScraperStrategy implements ScraperStrategy { - private readonly repoStrategy = new GitHubRepoScraperStrategy(); - private readonly wikiStrategy = new GitHubWikiScraperStrategy(); +export class GitHubScraperStrategy extends BaseScraperStrategy { + private readonly httpFetcher = new HttpFetcher(); + private readonly wikiProcessor = new GitHubWikiProcessor(); + private readonly repoProcessor = new GitHubRepoProcessor(); canHandle(url: string): boolean { try { const parsedUrl = new URL(url); const { hostname, pathname } = parsedUrl; - // Only handle base GitHub repository URLs, not specific paths like /wiki/, /blob/, /tree/ + // Handle GitHub repository URLs if (!["github.com", "www.github.com"].includes(hostname)) { return false; } - // Check if it's a base repository URL (owner/repo format) - const pathMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/); - return pathMatch !== null; + // Handle base repository URLs (owner/repo) + const baseMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/?$/); + if (baseMatch) { + return true; + } + + // Handle tree URLs (owner/repo/tree/branch/...) + const treeMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/tree\//); + if (treeMatch) { + return true; + } + + // Handle blob URLs (owner/repo/blob/branch/...) + const blobMatch = pathname.match(/^\/([^/]+)\/([^/]+)\/blob\//); + if (blobMatch) { + return true; + } + + return false; } catch { return false; } } - async scrape( - options: ScraperOptions, - progressCallback: ProgressCallback, - signal?: AbortSignal, - ): Promise { - // Validate it's a GitHub URL - const url = new URL(options.url); - if (!url.hostname.includes("github.com")) { - throw new Error("URL must be a GitHub URL"); + /** + * Parses a GitHub URL to extract repository information. + */ + private parseGitHubUrl( + url: string, + ): GitHubRepoInfo & { isBlob?: boolean; filePath?: string } { + const parsedUrl = new URL(url); + // Extract // from github.com///... 
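+      // i.e., capture the <owner> and <repo> path segments from a github.com/<owner>/<repo>/... URL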
+ const match = parsedUrl.pathname.match(/^\/([^/]+)\/([^/]+)/); + if (!match) { + throw new Error(`Invalid GitHub repository URL: ${url}`); } - // Parse the repository information - const pathMatch = url.pathname.match(/^\/([^/]+)\/([^/]+)\/?$/); - if (!pathMatch) { - throw new Error("URL must be a base GitHub repository URL"); - } - - const [, owner, repo] = pathMatch; - logger.info(`🚀 Starting comprehensive GitHub scraping for ${owner}/${repo}`); - - // We'll track progress from both strategies and merge them - let totalPagesDiscovered = 0; - let wikiPagesScraped = 0; - let wikiCompleted = false; - let repoCompleted = false; - - const mergedProgressCallback: ProgressCallback = async ( - progress, - ) => { - // For the first strategy (wiki), accumulate discovered pages and scraped count - if (!wikiCompleted) { - totalPagesDiscovered = progress.totalDiscovered; - wikiPagesScraped = progress.pagesScraped; - } else if (!repoCompleted) { - // For the second strategy (repo), create cumulative progress - progress = { - ...progress, - pagesScraped: wikiPagesScraped + progress.pagesScraped, - totalPages: wikiPagesScraped + progress.totalPages, - totalDiscovered: totalPagesDiscovered + progress.totalDiscovered, - }; - } + const [, owner, repo] = match; - // Report the progress as-is and await completion - await progressCallback(progress); - }; + // Extract branch and optional subpath from URLs like /tree// + const segments = parsedUrl.pathname.split("/").filter(Boolean); - try { - // First, attempt to scrape the wiki (prioritized for better documentation) - const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`; - const wikiOptions = { ...options, url: wikiUrl }; + // Handle /blob/ URLs for single file indexing + if (segments.length >= 4 && segments[2] === "blob") { + const branch = segments[3]; + const filePath = segments.length > 4 ? segments.slice(4).join("/") : undefined; + return { owner, repo, branch, filePath, isBlob: true }; + } + + // Handle /tree/ URLs with branch and optional subpath + if (segments.length >= 4 && segments[2] === "tree") { + const branch = segments[3]; + const subPath = segments.length > 4 ? segments.slice(4).join("/") : undefined; + return { owner, repo, branch, subPath }; + } - logger.info(`📖 Attempting to scrape wiki for ${owner}/${repo}`); + // Base repository URL + return { owner, repo }; + } + + /** + * Fetches the repository tree structure from GitHub API. + */ + private async fetchRepositoryTree( + repoInfo: GitHubRepoInfo, + signal?: AbortSignal, + ): Promise<{ tree: GitHubTreeResponse; resolvedBranch: string }> { + const { owner, repo, branch } = repoInfo; + // If no branch specified, fetch the default branch first + let targetBranch = branch; + if (!targetBranch) { try { - // Check if the wiki exists by trying to access it - await this.wikiStrategy.scrape(wikiOptions, mergedProgressCallback, signal); - wikiCompleted = true; - logger.info( - `✅ Completed wiki scraping for ${owner}/${repo} (${wikiPagesScraped} pages)`, - ); + const repoUrl = `https://api.github.com/repos/${owner}/${repo}`; + logger.debug(`Fetching repository info: ${repoUrl}`); + + const repoContent = await this.httpFetcher.fetch(repoUrl, { signal }); + const content = + typeof repoContent.content === "string" + ? 
repoContent.content + : repoContent.content.toString("utf-8"); + const repoData = JSON.parse(content) as { default_branch: string }; + targetBranch = repoData.default_branch; + + logger.debug(`Using default branch: ${targetBranch}`); } catch (error) { - wikiCompleted = true; - logger.info(`ℹ️ Wiki not available or accessible for ${owner}/${repo}: ${error}`); - // Don't throw - wiki not existing is not a failure condition + logger.warn(`⚠️ Could not fetch default branch, using 'main': ${error}`); + targetBranch = "main"; + } + } + + const treeUrl = `https://api.github.com/repos/${owner}/${repo}/git/trees/${targetBranch}?recursive=1`; + logger.debug(`Fetching repository tree: ${treeUrl}`); + + const rawContent = await this.httpFetcher.fetch(treeUrl, { signal }); + const content = + typeof rawContent.content === "string" + ? rawContent.content + : rawContent.content.toString("utf-8"); + const treeData = JSON.parse(content) as GitHubTreeResponse; + + if (treeData.truncated) { + logger.warn( + `⚠️ Repository tree was truncated for ${owner}/${repo}. Some files may be missing.`, + ); + } + + return { tree: treeData, resolvedBranch: targetBranch }; + } + + /** + * Determines if a file should be processed based on its path and type. + */ + private shouldProcessFile(item: GitHubTreeItem, options: ScraperOptions): boolean { + if (item.type !== "blob") { + return false; + } + + const path = item.path; + + // Whitelist of text-based file extensions + const textExtensions = [ + ".md", + ".mdx", + ".txt", + ".rst", + ".adoc", + ".asciidoc", + ".html", + ".htm", + ".xml", + ".css", + ".scss", + ".sass", + ".less", + ".js", + ".jsx", + ".ts", + ".tsx", + ".py", + ".java", + ".c", + ".cpp", + ".cc", + ".cxx", + ".h", + ".hpp", + ".cs", + ".go", + ".rs", + ".rb", + ".php", + ".swift", + ".kt", + ".scala", + ".clj", + ".cljs", + ".hs", + ".elm", + ".dart", + ".r", + ".m", + ".mm", + ".sh", + ".bash", + ".zsh", + ".fish", + ".ps1", + ".bat", + ".cmd", + ".json", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".properties", + ".env", + ".gitignore", + ".dockerignore", + ".gitattributes", + ".editorconfig", + ".gradle", + ".pom", + ".sbt", + ".maven", + ".cmake", + ".make", + ".dockerfile", + ".mod", + ".sum", + ".sql", + ".graphql", + ".gql", + ".proto", + ".thrift", + ".avro", + ".csv", + ".tsv", + ".log", + ]; + + const pathLower = path.toLowerCase(); + const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext)); + const hasCompoundExtension = + pathLower.includes(".env.") || + pathLower.endsWith(".env") || + pathLower.includes(".config.") || + pathLower.includes(".lock"); + + const fileName = path.split("/").pop() || ""; + const fileNameLower = fileName.toLowerCase(); + const commonTextFiles = [ + "readme", + "license", + "changelog", + "contributing", + "authors", + "maintainers", + "dockerfile", + "makefile", + "rakefile", + "gemfile", + "podfile", + "cartfile", + "brewfile", + "procfile", + "vagrantfile", + "gulpfile", + "gruntfile", + ".prettierrc", + ".eslintrc", + ".babelrc", + ".nvmrc", + ".npmrc", + ]; + + const isCommonTextFile = commonTextFiles.some((name) => { + if (name.startsWith(".")) { + return fileNameLower === name || fileNameLower.startsWith(`${name}.`); } + return fileNameLower === name || fileNameLower.startsWith(`${name}.`); + }); - // Then, scrape the repository code with adjusted page limit - const maxPages = options.maxPages || 1000; - const remainingPages = Math.max(0, maxPages - wikiPagesScraped); + // If file passes known checks, include it + if 
(hasTextExtension || hasCompoundExtension || isCommonTextFile) { + return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns); + } + + // Fallback: check if unknown extension has text/* MIME type + const mimeType = mime.getType(path); + if (mimeType?.startsWith("text/")) { + logger.debug(`Including file with text MIME type: ${path} (${mimeType})`); + return shouldIncludeUrl(path, options.includePatterns, options.excludePatterns); + } - if (remainingPages > 0) { - logger.info( - `📂 Scraping repository code for ${owner}/${repo} (${remainingPages} pages remaining)`, + // Not a text file + return false; + } + + /** + * Checks if a path is within the specified subpath. + */ + private isWithinSubPath(path: string, subPath?: string): boolean { + if (!subPath) { + return true; + } + + const trimmedSubPath = subPath.replace(/^\/+/, "").replace(/\/+$/, ""); + if (trimmedSubPath.length === 0) { + return true; + } + + const normalizedPath = path.replace(/^\/+/, "").replace(/\/+$/, ""); + if (normalizedPath === trimmedSubPath) { + return true; + } + + return normalizedPath.startsWith(`${trimmedSubPath}/`); + } + + async processItem( + item: QueueItem, + options: ScraperOptions, + signal?: AbortSignal, + ): Promise { + // Delegate to wiki processor for wiki URLs + // Use precise pattern matching: /owner/repo/wiki or /owner/repo/wiki/ + try { + const parsedUrl = new URL(item.url); + if (/^\/[^/]+\/[^/]+\/wiki($|\/)/.test(parsedUrl.pathname)) { + return await this.wikiProcessor.process(item, options, signal); + } + } catch { + // If URL parsing fails, fall through to other handlers + } + + // For the main repository URL (depth 0), perform discovery + // This includes blob URLs at depth 0, which should return themselves as discovered links + if (item.depth === 0) { + const repoInfo = this.parseGitHubUrl(options.url); + const { owner, repo } = repoInfo; + + logger.debug(`Discovering GitHub repository ${owner}/${repo}`); + + const discoveredLinks: string[] = []; + + // Handle single file (blob) URLs - strict scoping: index ONLY the file + if ("isBlob" in repoInfo && repoInfo.isBlob && repoInfo.filePath) { + const { branch = "main", filePath } = repoInfo; + logger.debug( + `Single file URL detected: ${owner}/${repo}/${filePath} - indexing file only`, ); - const repoOptions = { ...options, maxPages: remainingPages }; - await this.repoStrategy.scrape(repoOptions, mergedProgressCallback, signal); - repoCompleted = true; - logger.info(`✅ Completed repository code scraping for ${owner}/${repo}`); - } else { - logger.info( - `ℹ️ Skipping repository code scraping - page limit reached with wiki content`, + + // Generate HTTPS blob URL for storage + discoveredLinks.push( + `https://github.com/${owner}/${repo}/blob/${branch}/${filePath}`, ); + + return { + url: item.url, + links: discoveredLinks, + status: FetchStatus.SUCCESS, + }; } - logger.info(`🎉 Comprehensive GitHub scraping completed for ${owner}/${repo}`); + // Discover wiki URL for full repo scrapes (will be processed by GitHubWikiScraperStrategy) + const wikiUrl = `${options.url.replace(/\/$/, "")}/wiki`; + discoveredLinks.push(wikiUrl); + logger.debug(`Discovered wiki URL: ${wikiUrl}`); + + // 3. 
Discover all files in the repository + const { tree, resolvedBranch } = await this.fetchRepositoryTree(repoInfo, signal); + + const fileItems = tree.tree + .filter((treeItem) => this.isWithinSubPath(treeItem.path, repoInfo.subPath)) + .filter((treeItem) => this.shouldProcessFile(treeItem, options)); + + logger.debug( + `Discovered ${fileItems.length} processable files in repository (branch: ${resolvedBranch})`, + ); + + // Create HTTPS blob URLs for storage in database + // These are user-friendly, clickable URLs that work outside the system + const fileUrls = fileItems.map( + (treeItem) => + `https://github.com/${owner}/${repo}/blob/${resolvedBranch}/${treeItem.path}`, + ); + + discoveredLinks.push(...fileUrls); + + logger.debug( + `Discovery complete: ${fileUrls.length} repo file(s) + 1 wiki URL = ${discoveredLinks.length} total URLs`, + ); + + return { url: item.url, links: discoveredLinks, status: FetchStatus.SUCCESS }; + } + + // Handle HTTPS blob URLs at depth > 0 (from database during refresh or discovered files) + // Process blob URLs directly - fetch content and return empty links + // Use precise pattern matching: /owner/repo/blob/branch/path + try { + const parsedUrl = new URL(item.url); + if (/^\/[^/]+\/[^/]+\/blob\//.test(parsedUrl.pathname)) { + logger.debug(`Processing HTTPS blob URL at depth ${item.depth}: ${item.url}`); + return await this.repoProcessor.process(item, options, signal); + } } catch (error) { - logger.error(`❌ GitHub scraping failed for ${owner}/${repo}: ${error}`); - throw error; + logger.warn(`⚠️ Failed to parse blob URL ${item.url}: ${error}`); + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; } + + // For any other URLs at non-zero depth, return empty (shouldn't happen in practice) + logger.debug(`No further processing for URL at depth ${item.depth}: ${item.url}`); + return { url: item.url, links: [], status: FetchStatus.SUCCESS }; + } + + async scrape( + options: ScraperOptions, + progressCallback: ProgressCallback, + signal?: AbortSignal, + ): Promise { + const url = new URL(options.url); + if (!url.hostname.includes("github.com")) { + throw new Error("URL must be a GitHub URL"); + } + + // Use the base class implementation which handles initialQueue properly + // The processItem method will discover all wiki and repo file URLs + // The base scraper will automatically deduplicate URLs from initialQueue + await super.scrape(options, progressCallback, signal); } - /** - * Cleanup resources used by both underlying strategies. 
- */ async cleanup(): Promise { - await Promise.allSettled([this.repoStrategy.cleanup(), this.wikiStrategy.cleanup()]); + await Promise.all([this.wikiProcessor.cleanup(), this.repoProcessor.cleanup()]); } } diff --git a/src/scraper/strategies/GitHubWikiScraperStrategy.ts b/src/scraper/strategies/GitHubWikiProcessor.ts similarity index 56% rename from src/scraper/strategies/GitHubWikiScraperStrategy.ts rename to src/scraper/strategies/GitHubWikiProcessor.ts index 59b3e7d2..de0cb6e4 100644 --- a/src/scraper/strategies/GitHubWikiScraperStrategy.ts +++ b/src/scraper/strategies/GitHubWikiProcessor.ts @@ -1,11 +1,12 @@ -import type { Document, ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import { HttpFetcher } from "../fetcher"; +import { FetchStatus } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline } from "../pipelines/types"; -import { ScrapeMode, type ScraperOptions, type ScraperProgress } from "../types"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem } from "../types"; +import { ScrapeMode, type ScraperOptions } from "../types"; import { shouldIncludeUrl } from "../utils/patternMatcher"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import type { ProcessItemResult } from "./BaseScraperStrategy"; interface GitHubWikiInfo { owner: string; @@ -13,7 +14,7 @@ interface GitHubWikiInfo { } /** - * GitHubWikiScraperStrategy handles scraping GitHub wiki pages using standard web scraping techniques. + * GitHubWikiProcessor handles scraping GitHub wiki pages using standard web scraping techniques. * GitHub wikis are separate from the main repository and are hosted at /wiki/ URLs. * * Features: @@ -22,34 +23,16 @@ interface GitHubWikiInfo { * - Processes wiki content as HTML/Markdown pages * - Stays within the wiki scope to avoid crawling the entire repository * - * Note: This strategy is specifically for /wiki/ URLs and does not handle regular repository files. + * This processor is stateless and contains the core logic from GitHubWikiScraperStrategy. */ -export class GitHubWikiScraperStrategy extends BaseScraperStrategy { +export class GitHubWikiProcessor { private readonly httpFetcher = new HttpFetcher(); private readonly pipelines: ContentPipeline[]; constructor() { - super(); this.pipelines = PipelineFactory.createStandardPipelines(); } - canHandle(url: string): boolean { - try { - const parsedUrl = new URL(url); - const { hostname, pathname } = parsedUrl; - - // Check if it's a GitHub URL and contains /wiki/ - // This should handle specific wiki URLs like /owner/repo/wiki/PageName - return ( - ["github.com", "www.github.com"].includes(hostname) && - pathname.includes("/wiki") && - pathname.match(/^\/([^/]+)\/([^/]+)\/wiki/) !== null - ); - } catch { - return false; - } - } - /** * Parses a GitHub wiki URL to extract repository information. */ @@ -66,15 +49,17 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { } /** - * Override shouldProcessUrl to only process URLs within the wiki scope. + * Determines if a URL should be processed within the wiki scope. 
*/ - protected shouldProcessUrl(url: string, options: ScraperOptions): boolean { + shouldProcessUrl(url: string, options: ScraperOptions): boolean { try { const parsedUrl = new URL(url); - const wikiInfo = this.parseGitHubWikiUrl(options.url); - const expectedWikiPath = `/${wikiInfo.owner}/${wikiInfo.repo}/wiki`; - // Only process URLs that are within the same wiki + // Get the expected repository info from the base URL + const baseWikiInfo = this.parseGitHubWikiUrl(options.url); + const expectedWikiPath = `/${baseWikiInfo.owner}/${baseWikiInfo.repo}/wiki`; + + // Check if the URL is within the same wiki if (!parsedUrl.pathname.startsWith(expectedWikiPath)) { return false; } @@ -93,27 +78,33 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { } } - protected async processItem( + /** + * Processes a single GitHub wiki page. + */ + async process( item: QueueItem, options: ScraperOptions, - _progressCallback?: ProgressCallback, signal?: AbortSignal, - ): Promise<{ document?: Document; links?: string[] }> { + ): Promise { const currentUrl = item.url; - logger.info( - `📖 Processing wiki page ${this.pageCount}/${options.maxPages}: ${currentUrl}`, - ); - try { - // Fetch the wiki page content - const rawContent = await this.httpFetcher.fetch(currentUrl, { signal }); + // Fetch the wiki page content with ETag for conditional requests + const rawContent = await this.httpFetcher.fetch(currentUrl, { + signal, + etag: item.etag, + }); + + // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND + if (rawContent.status !== FetchStatus.SUCCESS) { + return { url: currentUrl, links: [], status: rawContent.status }; + } // Process content through appropriate pipeline - let processed: Awaited> | undefined; + let processed: PipelineResult | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) { logger.debug( `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${currentUrl})`, ); @@ -130,10 +121,10 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { logger.warn( `⚠️ Unsupported content type "${rawContent.mimeType}" for wiki page ${currentUrl}. Skipping processing.`, ); - return { document: undefined, links: [] }; + return { url: currentUrl, links: [], status: FetchStatus.SUCCESS }; } - for (const err of processed.errors) { + for (const err of processed.errors ?? []) { logger.warn(`⚠️ Processing error for ${currentUrl}: ${err.message}`); } @@ -145,22 +136,6 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { .replace(/^\//, ""); const pageTitle = wikiPagePath || "Home"; - // Create document with wiki-specific metadata - const document: Document = { - content: typeof processed.textContent === "string" ? processed.textContent : "", - metadata: { - url: currentUrl, - title: - typeof processed.metadata.title === "string" && - processed.metadata.title.trim() !== "" - ? 
processed.metadata.title - : pageTitle, - library: options.library, - version: options.version, - }, - contentType: rawContent.mimeType, - }; - // Extract links from the processed content const links = processed.links || []; @@ -200,41 +175,24 @@ export class GitHubWikiScraperStrategy extends BaseScraperStrategy { } }); - return { document, links: wikiLinks }; + return { + url: currentUrl, + title: pageTitle, + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, + links: wikiLinks, + status: FetchStatus.SUCCESS, + }; } catch (error) { logger.warn(`⚠️ Failed to process wiki page ${currentUrl}: ${error}`); - return { document: undefined, links: [] }; + return { url: currentUrl, links: [], status: FetchStatus.SUCCESS }; } } - async scrape( - options: ScraperOptions, - progressCallback: ProgressCallback, - signal?: AbortSignal, - ): Promise { - // Validate it's a GitHub wiki URL - const url = new URL(options.url); - if (!url.hostname.includes("github.com") || !url.pathname.includes("/wiki")) { - throw new Error("URL must be a GitHub wiki URL"); - } - - // Ensure the starting URL points to the wiki home if no specific page is provided - let startUrl = options.url; - if (url.pathname.endsWith("/wiki") || url.pathname.endsWith("/wiki/")) { - // If the URL just points to /wiki/, start from the Home page - startUrl = url.pathname.endsWith("/") - ? `${options.url}Home` - : `${options.url}/Home`; - } - - // Update options with the corrected start URL - const wikiOptions = { ...options, url: startUrl }; - - return super.scrape(wikiOptions, progressCallback, signal); - } - /** - * Cleanup resources used by this strategy. + * Cleanup resources used by this processor. */ async cleanup(): Promise { await Promise.allSettled(this.pipelines.map((pipeline) => pipeline.close())); diff --git a/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts b/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts deleted file mode 100644 index 868fc30b..00000000 --- a/src/scraper/strategies/GitHubWikiScraperStrategy.test.ts +++ /dev/null @@ -1,688 +0,0 @@ -import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { HttpFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; -import { HtmlPipeline } from "../pipelines/HtmlPipeline"; -import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; -import { ScrapeMode, type ScraperOptions } from "../types"; -import { GitHubWikiScraperStrategy } from "./GitHubWikiScraperStrategy"; - -// Mock the fetcher and pipelines -vi.mock("../fetcher"); -vi.mock("../pipelines/HtmlPipeline"); -vi.mock("../pipelines/MarkdownPipeline"); - -const mockHttpFetcher = vi.mocked(HttpFetcher); -const mockHtmlPipeline = vi.mocked(HtmlPipeline); -const mockMarkdownPipeline = vi.mocked(MarkdownPipeline); - -describe("GitHubWikiScraperStrategy", () => { - let strategy: GitHubWikiScraperStrategy; - let httpFetcherInstance: any; - let htmlPipelineInstance: any; - let markdownPipelineInstance: any; - - beforeEach(() => { - // Reset all mocks - vi.clearAllMocks(); - - // Setup fetcher mock - httpFetcherInstance = { - fetch: vi.fn(), - }; - mockHttpFetcher.mockImplementation(() => httpFetcherInstance); - - // Setup pipeline mocks - htmlPipelineInstance = { - canProcess: vi.fn(), - process: vi.fn(), - }; - markdownPipelineInstance = { - canProcess: vi.fn(), - process: vi.fn(), - }; - mockHtmlPipeline.mockImplementation(() => htmlPipelineInstance); - 
mockMarkdownPipeline.mockImplementation(() => markdownPipelineInstance); - - strategy = new GitHubWikiScraperStrategy(); - }); - - afterEach(() => { - vi.restoreAllMocks(); - }); - - describe("canHandle", () => { - it("should handle GitHub wiki URLs", () => { - expect(strategy.canHandle("https://github.com/owner/repo/wiki")).toBe(true); - expect(strategy.canHandle("https://github.com/owner/repo/wiki/")).toBe(true); - expect(strategy.canHandle("https://github.com/owner/repo/wiki/Home")).toBe(true); - expect( - strategy.canHandle("https://github.com/owner/repo/wiki/Getting-Started"), - ).toBe(true); - expect(strategy.canHandle("https://www.github.com/owner/repo/wiki/API")).toBe(true); - }); - - it("should not handle non-wiki GitHub URLs", () => { - expect(strategy.canHandle("https://github.com/owner/repo")).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/tree/main")).toBe(false); - expect( - strategy.canHandle("https://github.com/owner/repo/blob/main/README.md"), - ).toBe(false); - expect(strategy.canHandle("https://github.com/owner/repo/issues")).toBe(false); - }); - - it("should not handle non-GitHub URLs", () => { - expect(strategy.canHandle("https://example.com/wiki")).toBe(false); - expect(strategy.canHandle("https://gitlab.com/owner/repo/wiki")).toBe(false); - expect(strategy.canHandle("https://bitbucket.org/owner/repo/wiki")).toBe(false); - }); - - it("should handle malformed URLs gracefully", () => { - expect(strategy.canHandle("invalid-url")).toBe(false); - expect(strategy.canHandle("")).toBe(false); - expect(strategy.canHandle("not-a-url-at-all")).toBe(false); - }); - }); - - describe("parseGitHubWikiUrl", () => { - it("should parse basic wiki URL", () => { - const result = (strategy as any).parseGitHubWikiUrl( - "https://github.com/owner/repo/wiki", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should parse wiki URL with trailing slash", () => { - const result = (strategy as any).parseGitHubWikiUrl( - "https://github.com/owner/repo/wiki/", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should parse wiki URL with specific page", () => { - const result = (strategy as any).parseGitHubWikiUrl( - "https://github.com/owner/repo/wiki/Home", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should parse wiki URL with complex page name", () => { - const result = (strategy as any).parseGitHubWikiUrl( - "https://github.com/owner/repo/wiki/Getting-Started-Guide", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should handle www subdomain", () => { - const result = (strategy as any).parseGitHubWikiUrl( - "https://www.github.com/owner/repo/wiki", - ); - expect(result).toEqual({ - owner: "owner", - repo: "repo", - }); - }); - - it("should throw error for invalid wiki URL", () => { - expect(() => { - (strategy as any).parseGitHubWikiUrl("https://github.com/invalid"); - }).toThrow("Invalid GitHub wiki URL"); - - expect(() => { - (strategy as any).parseGitHubWikiUrl("https://github.com/owner/repo"); - }).toThrow("Invalid GitHub wiki URL"); - }); - }); - - describe("shouldProcessUrl", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki", - library: "test-lib", - version: "1.0.0", - }; - - it("should process URLs within the same wiki", () => { - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/Home", - options, - ), - ).toBe(true); - expect( - (strategy as 
any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/API", - options, - ), - ).toBe(true); - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/Getting-Started", - options, - ), - ).toBe(true); - }); - - it("should not process URLs outside the wiki", () => { - expect( - (strategy as any).shouldProcessUrl("https://github.com/owner/repo", options), - ).toBe(false); - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/tree/main", - options, - ), - ).toBe(false); - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/other/repo/wiki/Home", - options, - ), - ).toBe(false); - }); - - it("should respect include patterns", () => { - const optionsWithInclude = { - ...options, - includePatterns: ["API*", "Getting*"], - }; - - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/API-Reference", - optionsWithInclude, - ), - ).toBe(true); - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/Getting-Started", - optionsWithInclude, - ), - ).toBe(true); - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/Home", - optionsWithInclude, - ), - ).toBe(false); - }); - - it("should respect exclude patterns", () => { - const optionsWithExclude = { - ...options, - excludePatterns: ["*deprecated*", "old-*"], - }; - - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/deprecated-api", - optionsWithExclude, - ), - ).toBe(false); - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/old-guide", - optionsWithExclude, - ), - ).toBe(false); - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/current-guide", - optionsWithExclude, - ), - ).toBe(true); - }); - - it("should handle Home page as default", () => { - expect( - (strategy as any).shouldProcessUrl("https://github.com/owner/repo/wiki", options), - ).toBe(true); - expect( - (strategy as any).shouldProcessUrl( - "https://github.com/owner/repo/wiki/", - options, - ), - ).toBe(true); - }); - - it("should handle malformed URLs gracefully", () => { - expect((strategy as any).shouldProcessUrl("invalid-url", options)).toBe(false); - expect((strategy as any).shouldProcessUrl("", options)).toBe(false); - }); - }); - - describe("processItem", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki", - library: "test-lib", - version: "1.0.0", - }; - - it("should process wiki page and return document with links", async () => { - const rawContent: RawContent = { - content: ` - - - Wiki Home - -

-              <h1>Welcome to the Wiki</h1>
-              <p>This is the home page of our documentation.</p>
-              <a href="/owner/repo/wiki/API">API Documentation</a>
-              <a href="/owner/repo/wiki/Getting-Started">Getting Started</a>
-              <a href="https://external.com">External Link</a>
- - - - `, - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Home", - charset: "utf-8", - }; - - const processedContent = { - textContent: - "Wiki Home\n\nWelcome to the Wiki\n\nThis is the home page of our documentation.", - metadata: { title: "Wiki Home" }, - errors: [], - links: [ - "/owner/repo/wiki/API", - "/owner/repo/wiki/Getting-Started", - "https://external.com", - ], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document).toEqual({ - content: - "Wiki Home\n\nWelcome to the Wiki\n\nThis is the home page of our documentation.", - contentType: "text/html", - metadata: { - url: "https://github.com/owner/repo/wiki/Home", - title: "Wiki Home", - library: "test-lib", - version: "1.0.0", - }, - }); - - // Should only include wiki links, not external links - expect(result.links).toEqual([ - "https://github.com/owner/repo/wiki/API", - "https://github.com/owner/repo/wiki/Getting-Started", - ]); - }); - - it("should use page name as title fallback when no title found", async () => { - const rawContent: RawContent = { - content: "

<html><body>Content without title</body></html>
", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Getting-Started", - charset: "utf-8", - }; - - const processedContent = { - textContent: "Content without title", - metadata: { title: "" }, - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { - url: "https://github.com/owner/repo/wiki/Getting-Started", - depth: 1, - }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document?.metadata.title).toBe("Getting-Started"); - }); - - it("should handle Home page title fallback", async () => { - const rawContent: RawContent = { - content: "

<html><body>Home page content</body></html>
", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki", - charset: "utf-8", - }; - - const processedContent = { - textContent: "Home page content", - metadata: { title: "" }, - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document?.metadata.title).toBe("Home"); - }); - - it("should force ScrapeMode.Fetch for consistent behavior", async () => { - const rawContent: RawContent = { - content: "

-          <html><body>Test</body></html>
", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Test", - charset: "utf-8", - }; - - const processedContent = { - textContent: "Test", - metadata: { title: "Test" }, - errors: [], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockImplementation( - async (_content: any, opts: any) => { - expect(opts.scrapeMode).toBe("fetch"); - return processedContent; - }, - ); - - const optionsWithPlaywright = { - ...options, - scrapeMode: ScrapeMode.Playwright, - }; - - const item = { url: "https://github.com/owner/repo/wiki/Test", depth: 1 }; - await (strategy as any).processItem(item, optionsWithPlaywright); - - expect(htmlPipelineInstance.process).toHaveBeenCalledWith( - rawContent, - expect.objectContaining({ scrapeMode: "fetch" }), - expect.any(Object), - ); - }); - - it("should handle unsupported content types", async () => { - const rawContent: RawContent = { - content: "binary content", - mimeType: "application/octet-stream", - source: "https://github.com/owner/repo/wiki/Binary", - charset: "utf-8", - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(false); - markdownPipelineInstance.canProcess.mockReturnValue(false); - - const item = { url: "https://github.com/owner/repo/wiki/Binary", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document).toBeUndefined(); - expect(result.links).toEqual([]); - }); - - it("should handle fetch errors gracefully", async () => { - httpFetcherInstance.fetch.mockRejectedValue(new Error("Network error")); - - const item = { url: "https://github.com/owner/repo/wiki/Unreachable", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document).toBeUndefined(); - expect(result.links).toEqual([]); - }); - - it("should handle processing errors from pipelines", async () => { - const rawContent: RawContent = { - content: "

-          <html><body>Test</body></html>
", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Test", - charset: "utf-8", - }; - - const processedContentWithErrors = { - textContent: "Test", - metadata: { title: "Test" }, - errors: [new Error("Processing warning")], - links: [], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContentWithErrors); - - const item = { url: "https://github.com/owner/repo/wiki/Test", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.document).toBeDefined(); - expect(result.document?.content).toBe("Test"); - }); - }); - - describe("scrape", () => { - it("should validate GitHub wiki URL", async () => { - const invalidOptions: ScraperOptions = { - url: "https://example.com/wiki", - library: "test-lib", - version: "1.0.0", - }; - - await expect(strategy.scrape(invalidOptions, vi.fn())).rejects.toThrow( - "URL must be a GitHub wiki URL", - ); - }); - - it("should validate GitHub URL without wiki path", async () => { - const invalidOptions: ScraperOptions = { - url: "https://github.com/owner/repo", - library: "test-lib", - version: "1.0.0", - }; - - await expect(strategy.scrape(invalidOptions, vi.fn())).rejects.toThrow( - "URL must be a GitHub wiki URL", - ); - }); - - it("should append /Home to bare wiki URLs", async () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki", - library: "test-lib", - version: "1.0.0", - }; - - // Mock super.scrape to capture the options passed to it - const superScrapeSpy = vi.spyOn( - Object.getPrototypeOf(Object.getPrototypeOf(strategy)), - "scrape", - ); - superScrapeSpy.mockResolvedValue(undefined); - - await strategy.scrape(options, vi.fn()); - - expect(superScrapeSpy).toHaveBeenCalledWith( - expect.objectContaining({ - url: "https://github.com/owner/repo/wiki/Home", - }), - expect.any(Function), - undefined, - ); - - superScrapeSpy.mockRestore(); - }); - - it("should append /Home to wiki URLs with trailing slash", async () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki/", - library: "test-lib", - version: "1.0.0", - }; - - const superScrapeSpy = vi.spyOn( - Object.getPrototypeOf(Object.getPrototypeOf(strategy)), - "scrape", - ); - superScrapeSpy.mockResolvedValue(undefined); - - await strategy.scrape(options, vi.fn()); - - expect(superScrapeSpy).toHaveBeenCalledWith( - expect.objectContaining({ - url: "https://github.com/owner/repo/wiki/Home", - }), - expect.any(Function), - undefined, - ); - - superScrapeSpy.mockRestore(); - }); - - it("should not modify URLs that already point to specific pages", async () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki/Getting-Started", - library: "test-lib", - version: "1.0.0", - }; - - const superScrapeSpy = vi.spyOn( - Object.getPrototypeOf(Object.getPrototypeOf(strategy)), - "scrape", - ); - superScrapeSpy.mockResolvedValue(undefined); - - await strategy.scrape(options, vi.fn()); - - expect(superScrapeSpy).toHaveBeenCalledWith( - expect.objectContaining({ - url: "https://github.com/owner/repo/wiki/Getting-Started", - }), - expect.any(Function), - undefined, - ); - - superScrapeSpy.mockRestore(); - }); - }); - - describe("Link filtering and URL normalization", () => { - const options: ScraperOptions = { - url: "https://github.com/owner/repo/wiki", - library: "test-lib", - version: "1.0.0", - }; - - it("should convert relative links 
to absolute URLs", async () => { - const rawContent: RawContent = { - content: ` - - API Docs - Getting Started - Advanced - - `, - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Home", - charset: "utf-8", - }; - - const processedContent = { - textContent: "Content", - metadata: { title: "Test" }, - errors: [], - links: ["/owner/repo/wiki/API", "Getting-Started", "./Advanced-Topics"], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.links).toEqual([ - "https://github.com/owner/repo/wiki/API", - "https://github.com/owner/repo/wiki/Getting-Started", - "https://github.com/owner/repo/wiki/Advanced-Topics", - ]); - }); - - it("should filter out non-wiki links", async () => { - const rawContent: RawContent = { - content: "Content", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Home", - charset: "utf-8", - }; - - const processedContent = { - textContent: "Content", - metadata: { title: "Test" }, - errors: [], - links: [ - "https://github.com/owner/repo/wiki/API", // Should include - "https://github.com/owner/repo", // Should exclude (not wiki) - "https://github.com/other/repo/wiki/Home", // Should exclude (different repo) - "https://external.com/wiki", // Should exclude (external domain) - "mailto:test@example.com", // Should exclude (different protocol) - ], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - expect(result.links).toEqual(["https://github.com/owner/repo/wiki/API"]); - }); - - it("should handle malformed URLs in links gracefully", async () => { - const rawContent: RawContent = { - content: "Content", - mimeType: "text/html", - source: "https://github.com/owner/repo/wiki/Home", - charset: "utf-8", - }; - - const processedContent = { - textContent: "Content", - metadata: { title: "Test" }, - errors: [], - links: [ - "invalid-url", - "https://github.com/owner/repo/wiki/Valid", - "", - "not-a-url-at-all", - ], - }; - - httpFetcherInstance.fetch.mockResolvedValue(rawContent); - htmlPipelineInstance.canProcess.mockReturnValue(true); - htmlPipelineInstance.process.mockResolvedValue(processedContent); - - const item = { url: "https://github.com/owner/repo/wiki/Home", depth: 1 }; - const result = await (strategy as any).processItem(item, options); - - // Should only include the valid wiki link - expect(result.links).toEqual(["https://github.com/owner/repo/wiki/Valid"]); - }); - }); -}); diff --git a/src/scraper/strategies/LocalFileStrategy.test.ts b/src/scraper/strategies/LocalFileStrategy.test.ts index e871ea0e..dda0b5d6 100644 --- a/src/scraper/strategies/LocalFileStrategy.test.ts +++ b/src/scraper/strategies/LocalFileStrategy.test.ts @@ -1,6 +1,7 @@ import { vol } from "memfs"; import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { ScraperOptions } from "../types"; +import type { ProgressCallback } from "../../types"; +import type { ScrapeResult, ScraperOptions, ScraperProgressEvent } from "../types"; import { LocalFileStrategy } 
from "./LocalFileStrategy"; vi.mock("node:fs/promises", () => ({ default: vol.promises })); @@ -27,7 +28,7 @@ describe("LocalFileStrategy", () => { maxPages: 1, maxDepth: 0, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { @@ -39,25 +40,37 @@ describe("LocalFileStrategy", () => { await strategy.scrape(options, progressCallback); expect(progressCallback).toHaveBeenCalledTimes(1); - expect(progressCallback).toHaveBeenCalledWith( - expect.objectContaining({ - pagesScraped: 1, - currentUrl: "file:///test.md", - depth: 0, - maxDepth: 0, - totalPages: 1, - document: { - content: "# Test\n\nThis is a test file.", - contentType: "text/markdown", - metadata: { - url: "file:///test.md", - title: "Test", - library: "test", - version: "1.0", + + const firstCall = progressCallback.mock.calls[0][0]; + expect(firstCall).toMatchObject({ + pagesScraped: 1, + currentUrl: "file:///test.md", + depth: 0, + maxDepth: 0, + totalPages: 1, + totalDiscovered: 1, + pageId: undefined, + result: { + textContent: "# Test\n\nThis is a test file.", + contentType: "text/markdown", + url: "file:///test.md", + title: "Test", + links: [], + errors: [], + chunks: [ + { + content: "# Test\nThis is a test file.", // content is simplified + section: { + level: 1, + path: ["Test"], + }, + types: ["heading", "text"], }, - }, - }), - ); + ], + }, + } satisfies Partial); + expect(firstCall.result?.etag).toBeDefined(); + expect(firstCall.result?.lastModified).toBeDefined(); }); it("should process a directory with files and a subdirectory", async () => { @@ -69,7 +82,7 @@ describe("LocalFileStrategy", () => { maxPages: 10, maxDepth: 2, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { @@ -96,7 +109,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/testdir/file1.md": "# File 1", @@ -108,7 +121,7 @@ describe("LocalFileStrategy", () => { ); await strategy.scrape(options, progressCallback); - // All 3 files are processed: file1.md, file2.html, and file3.txt (as markdown) + // All 3 files are page: file1.md, file2.html, and file3.txt (as markdown) expect(progressCallback).toHaveBeenCalledTimes(3); // Validate .md @@ -120,16 +133,14 @@ describe("LocalFileStrategy", () => { depth: 1, maxDepth: 1, totalPages: 4, - document: expect.objectContaining({ - content: "# File 1", - metadata: expect.objectContaining({ - url: "file:///testdir/file1.md", - title: "File 1", - library: "test", - version: "1.0", - }), - }), - }), + totalDiscovered: 4, + result: expect.objectContaining({ + textContent: "# File 1", + contentType: "text/markdown", + url: "file:///testdir/file1.md", + title: "File 1", + } satisfies Partial), + } satisfies Partial), ); // Validate .html expect(progressCallback).toHaveBeenNthCalledWith( @@ -140,16 +151,14 @@ describe("LocalFileStrategy", () => { depth: 1, maxDepth: 1, totalPages: 4, - document: expect.objectContaining({ - content: expect.stringContaining("# File 2"), - metadata: expect.objectContaining({ - url: "file:///testdir/file2.html", - title: "File 2 Title", - library: "test", - version: "1.0", - }), - }), - }), + totalDiscovered: 4, + result: expect.objectContaining({ + textContent: expect.stringContaining("# File 2"), + contentType: "text/html", + url: "file:///testdir/file2.html", + title: "File 2 Title", + } satisfies Partial), + } satisfies Partial), ); // Validate .txt 
expect(progressCallback).toHaveBeenNthCalledWith( @@ -160,16 +169,14 @@ describe("LocalFileStrategy", () => { depth: 1, maxDepth: 1, totalPages: 4, - document: expect.objectContaining({ - content: "File 3", - metadata: expect.objectContaining({ - url: "file:///testdir/file3.txt", - title: "Untitled", - library: "test", - version: "1.0", - }), - }), - }), + totalDiscovered: 4, + result: expect.objectContaining({ + textContent: "File 3", + contentType: "text/plain", + url: "file:///testdir/file3.txt", + title: "file3.txt", + } satisfies Partial), + } satisfies Partial), ); }); @@ -183,7 +190,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { @@ -207,105 +214,96 @@ describe("LocalFileStrategy", () => { // Check TypeScript file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "app.ts", + textContent: expect.stringContaining("interface User"), contentType: "text/x-typescript", - content: expect.stringContaining("interface User"), - metadata: expect.objectContaining({ - url: "file:///codebase/app.ts", - }), - }), - }), + url: "file:///codebase/app.ts", + } satisfies Partial), + } satisfies Partial), ); // Check TSX file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "component.tsx", + textContent: expect.stringContaining("export const App"), contentType: "text/x-tsx", - content: expect.stringContaining("export const App"), - metadata: expect.objectContaining({ - url: "file:///codebase/component.tsx", - }), - }), - }), + url: "file:///codebase/component.tsx", + } satisfies Partial), + } satisfies Partial), ); // Check Python file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "script.py", + textContent: expect.stringContaining("def hello"), contentType: "text/x-python", - content: expect.stringContaining("def hello"), - metadata: expect.objectContaining({ - url: "file:///codebase/script.py", - }), - }), - }), + url: "file:///codebase/script.py", + } satisfies Partial), + } satisfies Partial), ); // Check Go file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "main.go", + textContent: expect.stringContaining("package main"), contentType: "text/x-go", - content: expect.stringContaining("package main"), - metadata: expect.objectContaining({ - url: "file:///codebase/main.go", - }), - }), - }), + url: "file:///codebase/main.go", + } satisfies Partial), + } satisfies Partial), ); // Check Rust file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "lib.rs", + textContent: expect.stringContaining("fn main"), contentType: "text/x-rust", - content: expect.stringContaining("fn main"), - metadata: expect.objectContaining({ - url: "file:///codebase/lib.rs", - }), - }), - }), + url: "file:///codebase/lib.rs", + } satisfies Partial), + } satisfies Partial), ); // Check Kotlin file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "App.kt", + textContent: 
expect.stringContaining("fun main"), contentType: "text/x-kotlin", - content: expect.stringContaining("fun main"), - metadata: expect.objectContaining({ - url: "file:///codebase/App.kt", - }), - }), - }), + url: "file:///codebase/App.kt", + } satisfies Partial), + } satisfies Partial), ); // Check Ruby file expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + title: "script.rb", + textContent: expect.stringContaining("puts"), contentType: "text/x-ruby", - content: expect.stringContaining("puts"), - metadata: expect.objectContaining({ - url: "file:///codebase/script.rb", - }), - }), - }), + url: "file:///codebase/script.rb", + } satisfies Partial), + } satisfies Partial), ); // Check Shell script expect(progressCallback).toHaveBeenCalledWith( expect.objectContaining({ - document: expect.objectContaining({ + result: expect.objectContaining({ + textContent: expect.stringContaining("#!/bin/bash"), contentType: "text/x-shellscript", - content: expect.stringContaining("#!/bin/bash"), - metadata: expect.objectContaining({ - url: "file:///codebase/run.sh", - }), - }), - }), + url: "file:///codebase/run.sh", + } satisfies Partial), + } satisfies Partial), ); }); @@ -319,7 +317,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/testdir/empty.md": "", @@ -334,16 +332,13 @@ describe("LocalFileStrategy", () => { expect.objectContaining({ pagesScraped: 1, currentUrl: "file:///testdir/empty.md", - document: expect.objectContaining({ - content: "", - metadata: expect.objectContaining({ - title: "Untitled", - url: "file:///testdir/empty.md", - library: "test", - version: "1.0", - }), - }), - }), + result: expect.objectContaining({ + textContent: "", + contentType: "text/markdown", + title: "Untitled", + url: "file:///testdir/empty.md", + } satisfies Partial), + } satisfies Partial), ); }); @@ -357,7 +352,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // Simulate a binary file (with null bytes) and an image file vol.fromJSON( { @@ -397,7 +392,7 @@ describe("LocalFileStrategy", () => { includePatterns: ["/file1.md", "/file3.txt"], excludePatterns: ["/file3.txt"], // exclude takes precedence }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/testdir/file1.md": "# File 1", // should be included @@ -424,7 +419,7 @@ describe("LocalFileStrategy", () => { maxPages: 1, maxDepth: 0, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/test dir/space file.md": "# Space File\n\nThis file has spaces in its name.", @@ -437,14 +432,13 @@ describe("LocalFileStrategy", () => { expect.objectContaining({ pagesScraped: 1, currentUrl: "file:///test%20dir/space%20file.md", - document: expect.objectContaining({ - content: "# Space File\n\nThis file has spaces in its name.", - metadata: expect.objectContaining({ - url: "file:///test%20dir/space%20file.md", - title: "Space File", - }), - }), - }), + result: expect.objectContaining({ + textContent: "# Space File\n\nThis file has spaces in its name.", + contentType: "text/markdown", + url: "file:///test%20dir/space%20file.md", + title: "Space File", + } satisfies Partial), + } satisfies Partial), ); }); @@ -458,7 +452,7 @@ describe("LocalFileStrategy", () => { maxDepth: 1, maxConcurrency: 
1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); vol.fromJSON( { "/test dir/file with space.md": "# File With Space", @@ -483,7 +477,7 @@ describe("LocalFileStrategy", () => { maxPages: 1, maxDepth: 0, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); // Create a JSON file with API documentation structure const jsonContent = JSON.stringify( @@ -538,17 +532,13 @@ describe("LocalFileStrategy", () => { maxDepth: 0, totalPages: 1, totalDiscovered: 1, - document: expect.objectContaining({ - content: jsonContent, + result: expect.objectContaining({ + textContent: jsonContent, contentType: "application/json", - metadata: expect.objectContaining({ - library: "test-api", - title: "Test API Documentation", - url: "file:///api-docs.json", - version: "1.0.0", - }), - }), - }), + title: "Test API Documentation", + url: "file:///api-docs.json", + } satisfies Partial), + } satisfies Partial), ); }); @@ -562,7 +552,7 @@ describe("LocalFileStrategy", () => { maxDepth: 0, maxConcurrency: 1, }; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testContent = "# Test Content\nThis is a test file."; vol.fromJSON( @@ -579,17 +569,314 @@ describe("LocalFileStrategy", () => { expect.objectContaining({ pagesScraped: 1, currentUrl: "file://testdir/test.md", // Original malformed URL preserved - document: expect.objectContaining({ - content: testContent, + result: expect.objectContaining({ + textContent: testContent, contentType: "text/markdown", - metadata: expect.objectContaining({ - title: "Test Content", - url: "file://testdir/test.md", - library: "test", - version: "1.0", - }), - }), - }), + title: "Test Content", + url: "file://testdir/test.md", + } satisfies Partial), + } satisfies Partial), ); }); + + describe("refresh workflow", () => { + it("should skip processing when file returns NOT_MODIFIED (unchanged)", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + const testContent = "# Test File\nOriginal content"; + + // Create initial file with a specific mtime + vol.fromJSON({ "/test.md": testContent }, "/"); + + // Get the file stats to capture the exact mtime + const stats = await vol.promises.stat("/test.md"); + const initialMtime = stats.mtime; + + // First scrape to get the initial etag + const initialOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + }; + + await strategy.scrape(initialOptions, progressCallback); + expect(progressCallback).toHaveBeenCalledTimes(1); + + // Get the etag from the first scrape + const firstCall = progressCallback.mock.calls[0][0]; + const etag = firstCall.result?.etag; + + // Verify the mtime hasn't changed + const statsAfterScrape = await vol.promises.stat("/test.md"); + expect(statsAfterScrape.mtime.getTime()).toBe(initialMtime.getTime()); + + // Reset the callback but DON'T reset the filesystem + // This preserves the file's mtime, so the etag stays the same + progressCallback.mockClear(); + + // Now do a refresh with the same etag (file unchanged) + const refreshOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + initialQueue: [ + { + url: "file:///test.md", + depth: 0, + pageId: 123, + etag: etag, + }, + ], + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Verify file was checked but returned NOT_MODIFIED (no result with content) + // The root URL at depth 0 is always processed 
to check for changes + expect(progressCallback).toHaveBeenCalledTimes(1); + expect(progressCallback).toHaveBeenCalledWith( + expect.objectContaining({ + pagesScraped: 1, + currentUrl: "file:///test.md", + depth: 0, + result: null, // NOT_MODIFIED returns null result + pageId: 123, + }), + ); + }); + + it("should re-process file when it has been modified", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + const originalContent = "# Original\nOriginal content"; + const updatedContent = "# Updated\nNew updated content"; + + // Create initial file + vol.fromJSON({ "/test.md": originalContent }, "/"); + + // First scrape + const initialOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + }; + + await strategy.scrape(initialOptions, progressCallback); + const firstCall = progressCallback.mock.calls[0][0]; + const oldEtag = firstCall.result?.etag; + + // Modify the file (update content and mtime) + // Using a new date for fromJSON will create a new mtime + vol.reset(); + vol.fromJSON({ "/test.md": updatedContent }, "/"); + + // Wait a bit to ensure different mtime + await new Promise((resolve) => setTimeout(resolve, 10)); + + progressCallback.mockClear(); + + // Refresh with old etag + const refreshOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + initialQueue: [ + { + url: "file:///test.md", + depth: 0, + pageId: 456, + etag: oldEtag, + }, + ], + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Verify file was re-processed + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(1); + expect(docCalls[0][0].result?.textContent).toContain("# Updated"); + expect(docCalls[0][0].result?.textContent).toContain("New updated content"); + expect(docCalls[0][0].result?.title).toBe("Updated"); + // Verify new etag is different + expect(docCalls[0][0].result?.etag).not.toBe(oldEtag); + }); + + it("should handle deleted files during refresh", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + const testContent = "# Test File\nContent"; + + // Create initial file + vol.fromJSON({ "/test.md": testContent }, "/"); + + // First scrape + const initialOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + }; + + await strategy.scrape(initialOptions, progressCallback); + const firstCall = progressCallback.mock.calls[0][0]; + const etag = firstCall.result?.etag; + + // Delete the file + vol.reset(); + + progressCallback.mockClear(); + + // Refresh with deleted file + const refreshOptions: ScraperOptions = { + url: "file:///test.md", + library: "test", + version: "1.0", + maxPages: 1, + maxDepth: 0, + initialQueue: [ + { + url: "file:///test.md", + depth: 0, + pageId: 789, + etag: etag, + }, + ], + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Verify no processed documents were returned + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(0); + }); + + it("should discover and process new files in a directory during refresh", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + + // Create initial directory with one file + vol.fromJSON( + { + "/testdir/file1.md": "# File 1", + }, + "/", + ); + + // First scrape + 
const initialOptions: ScraperOptions = { + url: "file:///testdir", + library: "test", + version: "1.0", + maxPages: 10, + maxDepth: 1, + }; + + await strategy.scrape(initialOptions, progressCallback); + expect(progressCallback).toHaveBeenCalledTimes(1); + + // Add a new file to the directory + vol.fromJSON( + { + "/testdir/file1.md": "# File 1", + "/testdir/file2.md": "# File 2\nNew file added", + }, + "/", + ); + + progressCallback.mockClear(); + + // Refresh the directory (directories don't use etag, they just re-scan) + const refreshOptions: ScraperOptions = { + url: "file:///testdir", + library: "test", + version: "1.0", + maxPages: 10, + maxDepth: 1, + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Should process both files + expect(progressCallback).toHaveBeenCalledTimes(2); + const calledUrls = progressCallback.mock.calls.map((call) => call[0].currentUrl); + expect(calledUrls).toContain("file:///testdir/file1.md"); + expect(calledUrls).toContain("file:///testdir/file2.md"); + }); + + it("should preserve depth from original scrape during refresh for nested files", async () => { + const strategy = new LocalFileStrategy(); + const progressCallback = vi.fn>(); + + vol.fromJSON( + { + "/testdir/subdir/deep/file.md": "# Deep File\nOriginal content", + }, + "/", + ); + + // First scrape starting from directory - file will be discovered at depth 3 + const initialOptions: ScraperOptions = { + url: "file:///testdir", + library: "test", + version: "1.0", + maxPages: 10, + maxDepth: 3, + }; + + await strategy.scrape(initialOptions, progressCallback); + expect(progressCallback).toHaveBeenCalledTimes(1); + const firstCall = progressCallback.mock.calls[0][0]; + expect(firstCall.depth).toBe(3); // File discovered at depth 3 + const etag = firstCall.result?.etag; + + // Update the file with new content + vol.reset(); + vol.fromJSON( + { + "/testdir/subdir/deep/file.md": "# Deep File\nUpdated content", + }, + "/", + ); + + await new Promise((resolve) => setTimeout(resolve, 10)); + + progressCallback.mockClear(); + + // Refresh starting from same directory with file in initialQueue at depth 3 + const refreshOptions: ScraperOptions = { + url: "file:///testdir", + library: "test", + version: "1.0", + maxPages: 10, + maxDepth: 3, + initialQueue: [ + { + url: "file:///testdir/subdir/deep/file.md", + depth: 3, // Original depth from discovery + pageId: 555, + etag: etag, + }, + ], + }; + + await strategy.scrape(refreshOptions, progressCallback); + + // Verify file was re-processed and depth from initialQueue is preserved + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(1); + expect(docCalls[0][0].depth).toBe(3); + expect(docCalls[0][0].pageId).toBe(555); + expect(docCalls[0][0].result?.textContent).toContain("Updated content"); + }); + }); }); diff --git a/src/scraper/strategies/LocalFileStrategy.ts b/src/scraper/strategies/LocalFileStrategy.ts index 1edfa9d7..d523a67f 100644 --- a/src/scraper/strategies/LocalFileStrategy.ts +++ b/src/scraper/strategies/LocalFileStrategy.ts @@ -1,13 +1,12 @@ import fs from "node:fs/promises"; import path from "node:path"; -import type { Document, ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import { FileFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline } from 
"../pipelines/types"; -import type { ScraperOptions, ScraperProgress } from "../types"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem, ScraperOptions } from "../types"; +import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; /** * LocalFileStrategy handles crawling and scraping of local files and folders using file:// URLs. @@ -29,12 +28,11 @@ export class LocalFileStrategy extends BaseScraperStrategy { return url.startsWith("file://"); } - protected async processItem( + async processItem( item: QueueItem, options: ScraperOptions, - _progressCallback?: ProgressCallback, _signal?: AbortSignal, - ): Promise<{ document?: Document; links?: string[] }> { + ): Promise { // Parse the file URL properly to handle both file:// and file:/// formats let filePath = item.url.replace(/^file:\/\/\/?/, ""); filePath = decodeURIComponent(filePath); @@ -44,7 +42,21 @@ export class LocalFileStrategy extends BaseScraperStrategy { filePath = `/${filePath}`; } - const stats = await fs.stat(filePath); + let stats: Awaited>; + try { + stats = await fs.stat(filePath); + } catch (error) { + // File not found + if ((error as NodeJS.ErrnoException).code === "ENOENT") { + logger.info(`✓ File deleted or not available: ${filePath}`); + return { + url: item.url, + links: [], + status: FetchStatus.NOT_FOUND, + }; + } + throw error; + } if (stats.isDirectory()) { const contents = await fs.readdir(filePath); @@ -52,17 +64,23 @@ export class LocalFileStrategy extends BaseScraperStrategy { const links = contents .map((name) => `file://${path.join(filePath, name)}`) .filter((url) => this.shouldProcessUrl(url, options)); - return { links }; + return { url: item.url, links, status: FetchStatus.SUCCESS }; } - logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`); + const rawContent: RawContent = await this.fileFetcher.fetch(item.url, { + etag: item.etag, + }); - const rawContent: RawContent = await this.fileFetcher.fetch(item.url); + // Handle NOT_MODIFIED status (file hasn't changed) + if (rawContent.status === FetchStatus.NOT_MODIFIED) { + logger.debug(`✓ File unchanged: ${filePath}`); + return { url: rawContent.source, links: [], status: FetchStatus.NOT_MODIFIED }; + } - let processed: Awaited> | undefined; + let processed: PipelineResult | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) { logger.debug( `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${filePath})`, ); @@ -75,27 +93,28 @@ export class LocalFileStrategy extends BaseScraperStrategy { logger.warn( `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`, ); - return { document: undefined, links: [] }; + return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS }; } - for (const err of processed.errors) { + for (const err of processed.errors ?? []) { logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); } + // Use filename as fallback if title is empty or not a string + const filename = path.basename(filePath); + const title = processed.title?.trim() || filename || null; + + // For local files, we don't follow links (no crawling within file content) + // Return empty links array return { - document: { - content: typeof processed.textContent === "string" ? 
processed.textContent : "", - contentType: rawContent.mimeType, - metadata: { - url: rawContent.source, - title: - typeof processed.metadata.title === "string" - ? processed.metadata.title - : "Untitled", - library: options.library, - version: options.version, - }, - } satisfies Document, + url: rawContent.source, + title: title, + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, + links: [], + status: FetchStatus.SUCCESS, }; } diff --git a/src/scraper/strategies/NpmScraperStrategy.ts b/src/scraper/strategies/NpmScraperStrategy.ts index 336b74ab..ce0bd199 100644 --- a/src/scraper/strategies/NpmScraperStrategy.ts +++ b/src/scraper/strategies/NpmScraperStrategy.ts @@ -1,5 +1,5 @@ import type { ProgressCallback } from "../../types"; -import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; +import type { ScraperOptions, ScraperProgressEvent, ScraperStrategy } from "../types"; import { WebScraperStrategy } from "./WebScraperStrategy"; export class NpmScraperStrategy implements ScraperStrategy { @@ -23,7 +23,7 @@ export class NpmScraperStrategy implements ScraperStrategy { async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, ): Promise { // Use default strategy with our configuration, passing the signal diff --git a/src/scraper/strategies/PyPiScraperStrategy.ts b/src/scraper/strategies/PyPiScraperStrategy.ts index abe31f7b..50360654 100644 --- a/src/scraper/strategies/PyPiScraperStrategy.ts +++ b/src/scraper/strategies/PyPiScraperStrategy.ts @@ -1,5 +1,5 @@ import type { ProgressCallback } from "../../types"; -import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; +import type { ScraperOptions, ScraperProgressEvent, ScraperStrategy } from "../types"; import { WebScraperStrategy } from "./WebScraperStrategy"; export class PyPiScraperStrategy implements ScraperStrategy { @@ -23,7 +23,7 @@ export class PyPiScraperStrategy implements ScraperStrategy { async scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, ): Promise { // Use default strategy with our configuration, passing the signal diff --git a/src/scraper/strategies/WebScraperStrategy.test.ts b/src/scraper/strategies/WebScraperStrategy.test.ts index e28f1d8b..44461660 100644 --- a/src/scraper/strategies/WebScraperStrategy.test.ts +++ b/src/scraper/strategies/WebScraperStrategy.test.ts @@ -1,6 +1,7 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { Document } from "../../types"; -import type { ScraperOptions } from "../types"; +import type { ProgressCallback } from "../../types"; +import { FetchStatus } from "../fetcher/types"; +import type { ScrapeResult, ScraperOptions, ScraperProgressEvent } from "../types"; import { ScrapeMode } from "../types"; // Import ScrapeMode import { WebScraperStrategy } from "./WebScraperStrategy"; @@ -32,6 +33,7 @@ describe("WebScraperStrategy", () => { content: "

      <html><body>Default Mock Content</body></html>
", mimeType: "text/html", source: "https://example.com", // Default source + status: FetchStatus.SUCCESS, }); // Create a fresh instance of the strategy for each test @@ -67,7 +69,7 @@ describe("WebScraperStrategy", () => { }, 10000); it("should use HttpFetcher to fetch content and process result", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com"; options.url = testUrl; // Ensure options match @@ -77,6 +79,7 @@ describe("WebScraperStrategy", () => { content: `${expectedTitle}

      </title></head><body><h1>Fetched Content</h1></body></html>
`, mimeType: "text/html", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); @@ -90,17 +93,17 @@ describe("WebScraperStrategy", () => { // Verify that the pipeline processed and called the callback with a document expect(progressCallback).toHaveBeenCalled(); const documentProcessingCall = progressCallback.mock.calls.find( - (call) => call[0].document, + (call) => call[0].result, ); expect(documentProcessingCall).toBeDefined(); // Use non-null assertion operator (!) since we've asserted it's defined - expect(documentProcessingCall![0].document.content).toBe("# Fetched Content"); // Check processed markdown (from H1) - expect(documentProcessingCall![0].document.metadata.title).toBe(expectedTitle); // Check extracted title (from ) + expect(documentProcessingCall![0].result?.textContent).toBe("# Fetched Content"); // Check processed markdown (from H1) + expect(documentProcessingCall![0].result?.title).toBe(expectedTitle); // Check extracted title (from <title>) }, 10000); it("should respect the followRedirects option", async () => { options.followRedirects = false; - const progressCallback = vi.fn(); + const progressCallback = vi.fn<ProgressCallback<ScraperProgressEvent>>(); await strategy.scrape(options, progressCallback); @@ -112,7 +115,7 @@ describe("WebScraperStrategy", () => { // Also check that processing still happened expect(progressCallback).toHaveBeenCalled(); const documentProcessingCall = progressCallback.mock.calls.find( - (call) => call[0].document, + (call) => call[0].result, ); expect(documentProcessingCall).toBeDefined(); }, 10000); @@ -134,19 +137,25 @@ describe("WebScraperStrategy", () => { mockFetchFn.mockImplementation(async (url: string) => { if (url === "https://example.com") - return { content: baseHtml, mimeType: "text/html", source: url }; + return { + content: baseHtml, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; // Return simple content for subpages, title reflects URL return { content: `<html><head><title>${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.scope = "subpages"; options.maxDepth = 1; // Limit depth for simplicity options.maxPages = 5; // Allow enough pages - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); @@ -174,22 +183,17 @@ describe("WebScraperStrategy", () => { ); // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); // Type guard - + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(4); - expect(receivedDocs.some((doc) => doc.metadata.title === "Test Site")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "Test Site")).toBe(true); expect( - receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage1"), + receivedDocs.some((doc) => doc?.title === "https://example.com/subpage1"), ).toBe(true); expect( - receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage2/"), + receivedDocs.some((doc) => doc?.title === "https://example.com/subpage2/"), ).toBe(true); expect( - receivedDocs.some( - (doc) => doc.metadata.title === "https://example.com/relative-path", - ), + receivedDocs.some((doc) => doc?.title === "https://example.com/relative-path"), ).toBe(true); }, 10000); @@ -201,18 +205,20 @@ describe("WebScraperStrategy", 
() => { 'BaseSubAPIOther', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.scope = "hostname"; options.maxDepth = 1; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); @@ -229,14 +235,12 @@ describe("WebScraperStrategy", () => { expect(mockFetchFn).not.toHaveBeenCalledWith("https://other.com", expect.anything()); // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(2); - expect(receivedDocs.some((doc) => doc.metadata.title === "Base")).toBe(true); - expect( - receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage"), - ).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "Base")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "https://example.com/subpage")).toBe( + true, + ); }, 10000); it("should follow links based on scope=domain", async () => { @@ -247,18 +251,20 @@ describe("WebScraperStrategy", () => { 'BaseSubAPIOther', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.scope = "domain"; options.maxDepth = 1; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); @@ -275,17 +281,15 @@ describe("WebScraperStrategy", () => { expect(mockFetchFn).not.toHaveBeenCalledWith("https://other.com", expect.anything()); // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(3); - expect(receivedDocs.some((doc) => doc.metadata.title === "Base")).toBe(true); - expect( - receivedDocs.some((doc) => doc.metadata.title === "https://example.com/subpage"), - ).toBe(true); - expect( - receivedDocs.some((doc) => doc.metadata.title === "https://api.example.com/ep"), - ).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "Base")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "https://example.com/subpage")).toBe( + true, + ); + expect(receivedDocs.some((doc) => doc?.title === "https://api.example.com/ep")).toBe( + true, + ); }, 10000); // --- Limit Tests --- @@ -300,6 +304,7 @@ describe("WebScraperStrategy", () => { 'L0L1', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } if (url === "https://example.com/level1") { @@ -309,6 +314,7 @@ describe("WebScraperStrategy", () => { 'L1L2', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } if (url === "https://example.com/level2") { @@ -318,6 +324,7 @@ describe("WebScraperStrategy", () => { 'L2L3', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } // Default for unexpected calls @@ -325,11 +332,12 @@ describe("WebScraperStrategy", () => { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.maxDepth = 1; // Limit depth - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await 
strategy.scrape(options, progressCallback); @@ -345,12 +353,10 @@ describe("WebScraperStrategy", () => { ); // Exceeds depth // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(2); // Base (L0) + L1 - expect(receivedDocs.some((doc) => doc.metadata.title === "L0")).toBe(true); - expect(receivedDocs.some((doc) => doc.metadata.title === "L1")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "L0")).toBe(true); + expect(receivedDocs.some((doc) => doc?.title === "L1")).toBe(true); }, 10000); it("should respect maxPages option", async () => { @@ -362,17 +368,19 @@ describe("WebScraperStrategy", () => { 'Base123', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); options.maxPages = 2; // Limit pages - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); @@ -396,9 +404,7 @@ describe("WebScraperStrategy", () => { expect(subpagesFetchedCount).toBe(1); // Exactly one subpage fetched // Verify documents via callback - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(2); // Base + 1 subpage }, 10000); @@ -413,23 +419,25 @@ describe("WebScraperStrategy", () => { 'Base12', mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); options.maxPages = 3; // Allow all pages options.maxDepth = 1; await strategy.scrape(options, progressCallback); // Verify callback calls - const callsWithDocs = progressCallback.mock.calls.filter((call) => call[0].document); + const callsWithDocs = progressCallback.mock.calls.filter((call) => call[0].result); expect(callsWithDocs).toHaveLength(3); // Base + page1 + page2 // Check structure of a progress call with a document @@ -439,19 +447,15 @@ describe("WebScraperStrategy", () => { currentUrl: expect.any(String), depth: expect.any(Number), maxDepth: options.maxDepth, - document: expect.objectContaining({ - content: expect.any(String), - metadata: expect.objectContaining({ - url: expect.any(String), - title: expect.any(String), // Title comes from pipeline now - library: options.library, - version: options.version, - }), - }), - }); + result: expect.objectContaining({ + textContent: expect.any(String), + url: expect.any(String), + title: expect.any(String), + } satisfies Partial), + } satisfies Partial); // Check specific URLs reported - const reportedUrls = callsWithDocs.map((call) => call[0].document.metadata.url); + const reportedUrls = callsWithDocs.map((call) => call[0].result?.url); expect(reportedUrls).toEqual( expect.arrayContaining([ "https://example.com", @@ -477,9 +481,10 @@ describe("WebScraperStrategy", () => { content: `${expectedTitle}

      </title></head><body><h1>Processed Content</h1></body></html>
`, mimeType: "text/html", source: urlWithCreds, + status: FetchStatus.SUCCESS, }); - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); // Ensure fetch was called with the credentialed URL @@ -488,14 +493,14 @@ describe("WebScraperStrategy", () => { expect.objectContaining({ followRedirects: true }), ); // Ensure a document was produced with the expected markdown and title - const docCall = progressCallback.mock.calls.find((call) => call[0].document); + const docCall = progressCallback.mock.calls.find((call) => call[0].result); expect(docCall).toBeDefined(); - expect(docCall![0].document.content).toContain(expectedMarkdown); - expect(docCall![0].document.metadata.title).toBe(expectedTitle); + expect(docCall![0].result?.textContent).toContain(expectedMarkdown); + expect(docCall![0].result?.title).toBe(expectedTitle); }, 10000); // Keep timeout for consistency but test should run quickly with fetch mode it("should forward custom headers to HttpFetcher", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com"; options.url = testUrl; options.headers = { @@ -506,6 +511,7 @@ describe("WebScraperStrategy", () => { content: "Header Test", mimeType: "text/html", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith( @@ -521,7 +527,7 @@ describe("WebScraperStrategy", () => { describe("pipeline selection", () => { it("should process HTML content through HtmlPipeline", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com"; options.url = testUrl; @@ -530,19 +536,20 @@ describe("WebScraperStrategy", () => { "HTML Test

        </title></head><body><h1>HTML Content</h1></body></html>
", mimeType: "text/html", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); // Verify HTML content was processed (converted to markdown) - const docCall = progressCallback.mock.calls.find((call) => call[0].document); + const docCall = progressCallback.mock.calls.find((call) => call[0].result); expect(docCall).toBeDefined(); - expect(docCall![0].document.content).toContain("# HTML Content"); - expect(docCall![0].document.metadata.title).toBe("HTML Test"); + expect(docCall![0].result?.textContent).toContain("# HTML Content"); + expect(docCall![0].result?.title).toBe("HTML Test"); }); it("should process markdown content through MarkdownPipeline", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com/readme.md"; options.url = testUrl; @@ -551,19 +558,22 @@ describe("WebScraperStrategy", () => { content: markdownContent, mimeType: "text/markdown", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); // Verify markdown content was processed - const docCall = progressCallback.mock.calls.find((call) => call[0].document); + const docCall = progressCallback.mock.calls.find((call) => call[0].result); expect(docCall).toBeDefined(); - expect(docCall![0].document.content).toContain("# Markdown Title"); - expect(docCall![0].document.content).toContain("This is already markdown content."); + expect(docCall![0].result?.textContent).toContain("# Markdown Title"); + expect(docCall![0].result?.textContent).toContain( + "This is already markdown content.", + ); }); it("should skip unsupported content types", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com/image.png"; options.url = testUrl; @@ -571,19 +581,20 @@ describe("WebScraperStrategy", () => { content: Buffer.from([0x89, 0x50, 0x4e, 0x47]), // PNG header mimeType: "image/png", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); // Verify no document was produced for unsupported content - const docCall = progressCallback.mock.calls.find((call) => call[0].document); + const docCall = progressCallback.mock.calls.find((call) => call[0].result); expect(docCall).toBeUndefined(); }); }); describe("error handling", () => { it("should handle fetch failures gracefully", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com/error"; options.url = testUrl; @@ -595,12 +606,12 @@ describe("WebScraperStrategy", () => { ); // Verify no documents were processed - const docCalls = progressCallback.mock.calls.filter((call) => call[0].document); + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); expect(docCalls).toHaveLength(0); }); it("should handle empty content gracefully", async () => { - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); const testUrl = "https://example.com/empty"; options.url = testUrl; @@ -608,6 +619,7 @@ describe("WebScraperStrategy", () => { content: "", // Empty content mimeType: "text/html", source: testUrl, + status: FetchStatus.SUCCESS, }); await strategy.scrape(options, progressCallback); @@ -640,17 +652,19 @@ describe("WebScraperStrategy", () => { `, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); 
options.maxDepth = 1; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await customStrategy.scrape(options, progressCallback); @@ -673,9 +687,7 @@ describe("WebScraperStrategy", () => { ); // Verify documents were produced for allowed pages - const receivedDocs = progressCallback.mock.calls - .map((call) => call[0].document) - .filter((doc): doc is Document => doc !== undefined); + const receivedDocs = progressCallback.mock.calls.map((call) => call[0].result); expect(receivedDocs).toHaveLength(3); // Base + 2 allowed pages }); }); @@ -695,12 +707,14 @@ describe("WebScraperStrategy", () => { content: `Link`, mimeType: "text/html", source: canonical, // Final URL after redirect + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -708,7 +722,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 5; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith(original, expect.anything()); @@ -732,12 +746,14 @@ describe("WebScraperStrategy", () => { `, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -746,7 +762,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 5; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith(start, expect.anything()); @@ -771,12 +787,14 @@ describe("WebScraperStrategy", () => { `, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -785,7 +803,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 10; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith(start, expect.anything()); @@ -805,12 +823,14 @@ describe("WebScraperStrategy", () => { content: `Nested`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } return { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -819,7 +839,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 5; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); expect(mockFetchFn).toHaveBeenCalledWith(startDir, expect.anything()); @@ -838,6 +858,7 @@ describe("WebScraperStrategy", () => { content: `Script`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; } // Any unexpected fetches return generic content @@ -845,6 +866,7 @@ describe("WebScraperStrategy", () => { content: `${url}${url}`, mimeType: "text/html", source: url, + status: FetchStatus.SUCCESS, }; }); @@ -853,7 +875,7 @@ describe("WebScraperStrategy", () => { options.maxDepth = 1; options.maxPages = 5; - const progressCallback = vi.fn(); + const progressCallback = vi.fn>(); await strategy.scrape(options, progressCallback); // Should fetch only the start page; the cross-origin (different hostname) base-derived link is filtered out @@ -866,14 +888,16 @@ describe("WebScraperStrategy", () => { const strategy = 
new WebScraperStrategy(); // Spy on the close method of all pipelines - (strategy as any).pipelines.forEach((pipeline: any) => { + // @ts-expect-error - pipelines is private, but we need to access it for testing + strategy.pipelines.forEach((pipeline: any) => { vi.spyOn(pipeline, "close"); }); await strategy.cleanup(); // Verify close was called on all pipelines - (strategy as any).pipelines.forEach((pipeline: any) => { + // @ts-expect-error - pipelines is private, but we need to access it for testing + strategy.pipelines.forEach((pipeline: any) => { expect(pipeline.close).toHaveBeenCalledOnce(); }); }); @@ -882,7 +906,8 @@ describe("WebScraperStrategy", () => { const strategy = new WebScraperStrategy(); // Mock one pipeline to throw an error during cleanup - vi.spyOn((strategy as any).pipelines[0], "close").mockRejectedValue( + // @ts-expect-error - pipelines is private, but we need to access it for testing + vi.spyOn(strategy.pipelines[0], "close").mockRejectedValue( new Error("Pipeline cleanup failed"), ); @@ -898,4 +923,353 @@ describe("WebScraperStrategy", () => { await expect(strategy.cleanup()).resolves.not.toThrow(); }); }); + + describe("refresh workflow", () => { + beforeEach(() => { + vi.resetAllMocks(); + mockFetchFn.mockResolvedValue({ + content: "

<html><body><h1>Default Mock Content</h1></body></html>
", + mimeType: "text/html", + source: "https://example.com", + status: FetchStatus.SUCCESS, + }); + strategy = new WebScraperStrategy(); + options = { + url: "https://example.com", + library: "test", + version: "1.0", + maxPages: 99, + maxDepth: 3, + scope: "subpages", + followRedirects: true, + scrapeMode: ScrapeMode.Fetch, + }; + }); + + it("should skip processing when page returns 304 Not Modified", async () => { + const progressCallback = vi.fn>(); + + // Configure mock to return 304 for a refresh operation + mockFetchFn.mockResolvedValue({ + content: "", + mimeType: "text/html", + source: "https://example.com/page1", + status: FetchStatus.NOT_MODIFIED, + }); + + // Create a queue item with pageId and etag (refresh operation) + options.initialQueue = [ + { + url: "https://example.com/page1", + depth: 0, + pageId: 123, + etag: "existing-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify fetch was called with etag + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/page1", + expect.objectContaining({ + etag: "existing-etag", + }), + ); + + // Verify no documents were processed (304 means unchanged) + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(0); + }); + + it("should report deleted flag when page returns 404 Not Found during refresh", async () => { + const progressCallback = vi.fn>(); + + // Configure mock to return 404 + mockFetchFn.mockResolvedValue({ + content: "", + mimeType: "text/html", + source: "https://example.com/deleted-page", + status: FetchStatus.NOT_FOUND, + }); + + // Create a queue item with pageId and etag (refresh operation) + options.initialQueue = [ + { + url: "https://example.com/deleted-page", + depth: 0, + pageId: 456, + etag: "old-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify fetch was called + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/deleted-page", + expect.objectContaining({ + etag: "old-etag", + }), + ); + + // Verify no processed documents were returned + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(0); + }); + + it("should refresh page content when page returns 200 OK", async () => { + const progressCallback = vi.fn>(); + const rootContent = + "Root

</title></head><body><h1>Root</h1></body></html>"; + const updatedContent = + "<html><head><title>Updated</title></head><body><h1>New Content</h1></body></html>
"; + + // Configure mock to return different content for root vs updated page + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com") { + return { + content: rootContent, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + } + return { + content: updatedContent, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + etag: "new-etag", + }; + }); + + // Create a queue item with pageId and etag (refresh operation) + options.initialQueue = [ + { + url: "https://example.com/updated-page", + depth: 1, + pageId: 789, + etag: "old-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify fetch was called for both root and updated page + expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything()); + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/updated-page", + expect.objectContaining({ + etag: "old-etag", + }), + ); + + // Verify both pages were processed (root at depth 0, updated page at depth 1) + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(2); + + // Find the updated page call + const updatedPageCall = docCalls.find( + (call) => call[0].currentUrl === "https://example.com/updated-page", + ); + expect(updatedPageCall).toBeDefined(); + expect(updatedPageCall![0].result?.textContent).toContain("# New Content"); + expect(updatedPageCall![0].result?.title).toBe("Updated"); + expect(updatedPageCall![0].result?.etag).toBe("new-etag"); + }); + + it("should discover and follow new links during refresh operations", async () => { + const progressCallback = vi.fn>(); + const rootContent = + "Root

</title></head><body><h1>Root</h1></body></html>"; + const contentWithLinks = ` + <html> + <head><title>Refreshed Page</title></head> + <body> + <h1>Content</h1>
+ New Link + Another New Link + + + `; + + // Configure mock to return different content for root vs page + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com") { + return { + content: rootContent, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + } + return { + content: contentWithLinks, + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + etag: "new-etag", + }; + }); + + // Create a queue item with pageId and etag (refresh operation) + options.initialQueue = [ + { + url: "https://example.com/page-with-links", + depth: 1, + pageId: 999, + etag: "old-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify root, refresh page, and discovered links were all fetched + // Root (depth 0) + refresh page (depth 1) + 2 new links (depth 2) = 4 total + expect(mockFetchFn).toHaveBeenCalledTimes(4); + expect(mockFetchFn).toHaveBeenCalledWith("https://example.com", expect.anything()); + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/page-with-links", + expect.anything(), + ); + + // Verify the new links discovered during refresh WERE followed (this is correct behavior) + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/new-link", + expect.anything(), + ); + expect(mockFetchFn).toHaveBeenCalledWith( + "https://example.com/another-new-link", + expect.anything(), + ); + }); + + it("should process multiple pages in a refresh operation with mixed statuses", async () => { + const progressCallback = vi.fn>(); + + // Configure mock to return different statuses for different URLs + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com/unchanged") { + return { + content: "", + mimeType: "text/html", + source: url, + status: FetchStatus.NOT_MODIFIED, + }; + } + if (url === "https://example.com/deleted") { + return { + content: "", + mimeType: "text/html", + source: url, + status: FetchStatus.NOT_FOUND, + }; + } + if (url === "https://example.com/updated") { + return { + content: + "Updated

</title></head><body><h1>New</h1></body></html>
", + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + etag: "new-etag", + }; + } + return { + content: "Default", + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + }); + + // Create a queue with multiple pages (all at depth > 0 to avoid root URL processing) + options.initialQueue = [ + { + url: "https://example.com/unchanged", + depth: 1, + pageId: 1, + etag: "etag-1", + }, + { + url: "https://example.com/deleted", + depth: 1, + pageId: 2, + etag: "etag-2", + }, + { + url: "https://example.com/updated", + depth: 1, + pageId: 3, + etag: "etag-3", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify all three pages plus root were fetched (4 total) + expect(mockFetchFn).toHaveBeenCalledTimes(4); + + // Verify root was processed + only the updated page produced a processed document (2 total) + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(2); + + // Find the updated page (not the root) + const updatedPageCall = docCalls.find( + (call) => call[0].currentUrl === "https://example.com/updated", + ); + expect(updatedPageCall).toBeDefined(); + expect(updatedPageCall![0].result?.url).toBe("https://example.com/updated"); + expect(updatedPageCall![0].result?.title).toBe("Updated"); + }); + + it("should preserve depth from original scrape during refresh", async () => { + const progressCallback = vi.fn>(); + + mockFetchFn.mockImplementation(async (url: string) => { + if (url === "https://example.com") { + return { + content: + "Root

</title></head><body><h1>Root</h1></body></html>
", + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + }; + } + return { + content: + "Depth Test

</title></head><body><h1>Content</h1></body></html>
", + mimeType: "text/html", + source: url, + status: FetchStatus.SUCCESS, + etag: "new-etag", + }; + }); + + // Create a queue item with depth from original scrape + options.initialQueue = [ + { + url: "https://example.com/deep-page", + depth: 2, // This page was originally scraped at depth 2 + pageId: 555, + etag: "old-etag", + }, + ]; + + await strategy.scrape(options, progressCallback); + + // Verify both root and deep page were processed (2 documents) + const docCalls = progressCallback.mock.calls.filter((call) => call[0].result); + expect(docCalls).toHaveLength(2); + + // Find the deep page and verify it preserved its depth + const deepPageCall = docCalls.find( + (call) => call[0].currentUrl === "https://example.com/deep-page", + ); + expect(deepPageCall).toBeDefined(); + expect(deepPageCall![0].depth).toBe(2); + expect(deepPageCall![0].pageId).toBe(555); + }); + }); }); diff --git a/src/scraper/strategies/WebScraperStrategy.ts b/src/scraper/strategies/WebScraperStrategy.ts index 6fcf71a0..702650d6 100644 --- a/src/scraper/strategies/WebScraperStrategy.ts +++ b/src/scraper/strategies/WebScraperStrategy.ts @@ -1,13 +1,12 @@ -import type { Document, ProgressCallback } from "../../types"; import { logger } from "../../utils/logger"; import type { UrlNormalizerOptions } from "../../utils/url"; import { AutoDetectFetcher } from "../fetcher"; -import type { RawContent } from "../fetcher/types"; +import { FetchStatus, type RawContent } from "../fetcher/types"; import { PipelineFactory } from "../pipelines/PipelineFactory"; -import type { ContentPipeline, ProcessedContent } from "../pipelines/types"; -import type { ScraperOptions, ScraperProgress } from "../types"; +import type { ContentPipeline, PipelineResult } from "../pipelines/types"; +import type { QueueItem, ScraperOptions } from "../types"; import { isInScope } from "../utils/scope"; -import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; +import { BaseScraperStrategy, type ProcessItemResult } from "./BaseScraperStrategy"; export interface WebScraperStrategyOptions { urlNormalizerOptions?: UrlNormalizerOptions; @@ -47,26 +46,45 @@ export class WebScraperStrategy extends BaseScraperStrategy { protected override async processItem( item: QueueItem, options: ScraperOptions, - _progressCallback?: ProgressCallback, // Base class passes it, but not used here - signal?: AbortSignal, // Add signal - ): Promise<{ document?: Document; links?: string[]; finalUrl?: string }> { + signal?: AbortSignal, + ): Promise { const { url } = item; try { - // Define fetch options, passing signal, followRedirects, and headers + // Log when processing with ETag for conditional requests + if (item.etag) { + logger.debug(`Processing ${url} with stored ETag: ${item.etag}`); + } + + // Define fetch options, passing signal, followRedirects, headers, and etag const fetchOptions = { signal, followRedirects: options.followRedirects, headers: options.headers, // Forward custom headers + etag: item.etag, // Pass ETag for conditional requests }; // Use AutoDetectFetcher which handles fallbacks automatically const rawContent: RawContent = await this.fetcher.fetch(url, fetchOptions); + logger.debug( + `Fetch result for ${url}: status=${rawContent.status}, etag=${rawContent.etag || "none"}`, + ); + + // Return the status directly - BaseScraperStrategy handles NOT_MODIFIED and NOT_FOUND + // Use the final URL from rawContent.source (which may differ due to redirects) + if (rawContent.status !== FetchStatus.SUCCESS) { + logger.debug(`Skipping pipeline for 
${url} due to status: ${rawContent.status}`); + return { url: rawContent.source, links: [], status: rawContent.status }; + } + // --- Start Pipeline Processing --- - let processed: ProcessedContent | undefined; + let processed: PipelineResult | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + const contentBuffer = Buffer.isBuffer(rawContent.content) + ? rawContent.content + : Buffer.from(rawContent.content); + if (pipeline.canProcess(rawContent.mimeType || "text/plain", contentBuffer)) { logger.debug( `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`, ); @@ -79,11 +97,11 @@ export class WebScraperStrategy extends BaseScraperStrategy { logger.warn( `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`, ); - return { document: undefined, links: [] }; + return { url: rawContent.source, links: [], status: FetchStatus.SUCCESS }; } // Log errors from pipeline - for (const err of processed.errors) { + for (const err of processed.errors ?? []) { logger.warn(`⚠️ Processing error for ${url}: ${err.message}`); } @@ -92,7 +110,11 @@ export class WebScraperStrategy extends BaseScraperStrategy { logger.warn( `⚠️ No processable content found for ${url} after pipeline execution.`, ); - return { document: undefined, links: processed.links }; + return { + url: rawContent.source, + links: processed.links, + status: FetchStatus.SUCCESS, + }; } // Determine base for scope filtering: @@ -103,35 +125,28 @@ export class WebScraperStrategy extends BaseScraperStrategy { ? new URL(rawContent.source) : (this.canonicalBaseUrl ?? new URL(options.url)); - const filteredLinks = processed.links.filter((link) => { - try { - const targetUrl = new URL(link); - const scope = options.scope || "subpages"; - return ( - isInScope(baseUrl, targetUrl, scope) && - (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl)) - ); - } catch { - return false; - } - }); + const filteredLinks = + processed.links?.filter((link) => { + try { + const targetUrl = new URL(link); + const scope = options.scope || "subpages"; + return ( + isInScope(baseUrl, targetUrl, scope) && + (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl)) + ); + } catch { + return false; + } + }) ?? []; return { - document: { - content: processed.textContent, - metadata: { - url, - title: - typeof processed.metadata.title === "string" - ? 
processed.metadata.title - : "Untitled", - library: options.library, - version: options.version, - ...processed.metadata, - }, - } satisfies Document, + url: rawContent.source, + etag: rawContent.etag, + lastModified: rawContent.lastModified, + contentType: rawContent.mimeType, + content: processed, links: filteredLinks, - finalUrl: rawContent.source, + status: FetchStatus.SUCCESS, }; } catch (error) { // Log fetch errors or pipeline execution errors (if run throws) diff --git a/src/scraper/types.ts b/src/scraper/types.ts index 05b670d8..d392f4c2 100644 --- a/src/scraper/types.ts +++ b/src/scraper/types.ts @@ -1,4 +1,15 @@ -import type { Document, ProgressCallback } from "../types"; +import type { Chunk } from "../splitter/types"; +import type { ProgressCallback } from "../types"; + +/** + * Represents an item in the scraping queue + */ +export type QueueItem = { + url: string; + depth: number; + pageId?: number; // Database page ID for efficient deletion during refresh + etag?: string | null; // Last known ETag for conditional requests during refresh +}; /** * Enum defining the available HTML processing strategies. @@ -16,7 +27,7 @@ export interface ScraperStrategy { canHandle(url: string): boolean; scrape( options: ScraperOptions, - progressCallback: ProgressCallback, + progressCallback: ProgressCallback, signal?: AbortSignal, // Add optional signal ): Promise; @@ -28,7 +39,16 @@ export interface ScraperStrategy { } /** - * Options for configuring the scraping process + * Internal runtime options for configuring the scraping process. + * + * This is the comprehensive configuration object used by ScraperService, PipelineWorker, + * and scraper strategies. It includes both: + * - User-facing options (provided via tools like scrape_docs) + * - System-managed options (set internally by PipelineManager) + * + * Note: User-facing tools should NOT expose all these options directly. Instead, + * PipelineManager is responsible for translating user input into this complete + * runtime configuration. */ export interface ScraperOptions { url: string; @@ -76,28 +96,75 @@ export interface ScraperOptions { * Keys are header names, values are header values. */ headers?: Record; + /** + * Pre-populated queue of pages to visit. + * When provided: + * - Disables link discovery and crawling + * - Processes only the provided URLs + * - Uses provided metadata (pageId, etag) for optimization + */ + initialQueue?: QueueItem[]; + /** + * Indicates whether this is a refresh operation (re-indexing existing version). + * When true: + * - Skips initial removeAllDocuments call to preserve existing data + * - Uses ETags for conditional requests + * - Only updates changed/deleted pages + * @default false + */ + isRefresh?: boolean; } /** - * Result of scraping a single page. Used internally by HtmlScraper. + * Result of scraping a single page. */ -export interface ScrapedPage { - content: string; - title: string; +export interface ScrapeResult { + /** The URL of the page that was scraped */ url: string; - /** URLs extracted from page links, used for recursive scraping */ + /** Page title */ + title: string; + /** MIME type of the content being processed */ + contentType: string; + /** The final processed content, typically as a string (e.g., Markdown). Used primarily for debugging */ + textContent: string; + /** Extracted links from the content. */ links: string[]; + /** Any non-critical errors encountered during processing. 
*/ + errors: Error[]; + /** Pre-split chunks from pipeline processing */ + chunks: Chunk[]; + /** ETag from HTTP response for caching */ + etag?: string | null; + /** Last-Modified from HTTP response for caching */ + lastModified?: string | null; } /** * Progress information during scraping */ -export interface ScraperProgress { +export interface ScraperProgressEvent { + /** Number of pages successfully scraped so far */ pagesScraped: number; - totalPages: number; // Effective total pages (limited by maxPages configuration) - totalDiscovered: number; // Actual number of pages discovered (may exceed totalPages) + /** + * Maximum number of pages to scrape (from maxPages option). + * May be undefined if no limit is set. + */ + totalPages: number; + /** + * Total number of URLs discovered during crawling. + * This may be higher than totalPages if maxPages limit is reached. + */ + totalDiscovered: number; + /** Current URL being processed */ currentUrl: string; + /** Current depth in the crawl tree */ depth: number; + /** Maximum depth allowed (from maxDepth option) */ maxDepth: number; - document?: Document; + /** The result of scraping the current page, if available. This may be null if the page has been deleted or if an error occurred. */ + result: ScrapeResult | null; + /** Database page ID (for refresh operations or tracking) */ + pageId?: number; + /** Indicates this page was deleted (404 during refresh or broken link) */ + deleted?: boolean; } diff --git a/src/services/workerService.ts b/src/services/workerService.ts index 1626b939..4f7e843b 100644 --- a/src/services/workerService.ts +++ b/src/services/workerService.ts @@ -47,7 +47,7 @@ export async function registerWorkerService(pipeline: IPipeline): Promise }, onJobError: async (job, error, document) => { logger.warn( - `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`, + `⚠️ Job ${job.id} error ${document ? 
`on document ${document.url}` : ""}: ${error.message}`, ); // Use PostHog's native error tracking instead of custom events diff --git a/src/splitter/GreedySplitter.test.ts b/src/splitter/GreedySplitter.test.ts index 9a604190..8ef7e19c 100644 --- a/src/splitter/GreedySplitter.test.ts +++ b/src/splitter/GreedySplitter.test.ts @@ -1,12 +1,12 @@ import { describe, expect, it, vi } from "vitest"; import { GreedySplitter } from "./GreedySplitter"; import { SemanticMarkdownSplitter } from "./SemanticMarkdownSplitter"; -import type { ContentChunk } from "./types"; +import type { Chunk } from "./types"; vi.mock("../utils/logger"); // Mock SemanticMarkdownSplitter -const createMockSemanticSplitter = (chunks: ContentChunk[]) => { +const createMockSemanticSplitter = (chunks: Chunk[]) => { const mockSplitText = vi.fn().mockResolvedValue(chunks); const mockSemanticSplitter = { splitText: mockSplitText, @@ -23,7 +23,7 @@ describe("GreedySplitter", () => { }); it("should return the original chunk if it's within min and max size", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "This is a single chunk.", @@ -37,7 +37,7 @@ describe("GreedySplitter", () => { }); it("should concatenate chunks until minChunkSize is reached", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Short text 1.", @@ -62,7 +62,7 @@ describe("GreedySplitter", () => { }); it("should respect H1/H2 boundaries", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Text before heading.", @@ -102,7 +102,7 @@ describe("GreedySplitter", () => { }); it("should not exceed preferredChunkSize", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "This is a long text chunk. 
", @@ -132,7 +132,7 @@ describe("GreedySplitter", () => { }); it("should preserve section metadata when concatenating chunks with identical sections", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Short text 1.", @@ -157,7 +157,7 @@ describe("GreedySplitter", () => { }); it("should merge heading with its content when minChunkSize > 0", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["heading"], content: "# Section 1", @@ -182,7 +182,7 @@ describe("GreedySplitter", () => { }); it("should keep heading separate when minChunkSize = 0", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["heading"], content: "# Section 1", @@ -201,7 +201,7 @@ describe("GreedySplitter", () => { }); it("should use deeper path when merging parent with child section", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Parent content", @@ -232,7 +232,7 @@ describe("GreedySplitter", () => { }); it("should use common parent when merging sibling sections", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "First subsection", @@ -266,7 +266,7 @@ describe("GreedySplitter", () => { }); it("should use root when merging sections with no common path", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "First section", @@ -300,7 +300,7 @@ describe("GreedySplitter", () => { }); it("should handle deeply nested sections", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ { types: ["text"], content: "Level 1", @@ -333,7 +333,7 @@ describe("GreedySplitter", () => { }); it("should handle deep sibling sections with common parent", async () => { - const initialChunks: ContentChunk[] = [ + const initialChunks: Chunk[] = [ // Deep sibling sections under Section 1 -> SubSection 1.1 { types: ["text"], diff --git a/src/splitter/GreedySplitter.ts b/src/splitter/GreedySplitter.ts index ba068f99..b1181bd1 100644 --- a/src/splitter/GreedySplitter.ts +++ b/src/splitter/GreedySplitter.ts @@ -1,4 +1,4 @@ -import type { ContentChunk, DocumentSplitter, SectionContentType } from "./types"; +import type { Chunk, DocumentSplitter, SectionContentType } from "./types"; /** * Takes small document chunks and greedily concatenates them into larger, more meaningful units @@ -36,10 +36,10 @@ export class GreedySplitter implements DocumentSplitter { * section boundaries to maintain document structure. This balances the need for * context with semantic coherence. 
*/ - async splitText(markdown: string, contentType?: string): Promise { + async splitText(markdown: string, contentType?: string): Promise { const initialChunks = await this.baseSplitter.splitText(markdown, contentType); - const concatenatedChunks: ContentChunk[] = []; - let currentChunk: ContentChunk | null = null; + const concatenatedChunks: Chunk[] = []; + let currentChunk: Chunk | null = null; for (const nextChunk of initialChunks) { if (currentChunk) { @@ -71,7 +71,7 @@ export class GreedySplitter implements DocumentSplitter { return concatenatedChunks; } - private cloneChunk(chunk: ContentChunk): ContentChunk { + private cloneChunk(chunk: Chunk): Chunk { return { types: [...chunk.types], content: chunk.content, @@ -86,7 +86,7 @@ export class GreedySplitter implements DocumentSplitter { * H1 and H2 headings represent major conceptual breaks in the document. * Preserving these splits helps maintain the document's logical structure. */ - private startsNewMajorSection(chunk: ContentChunk): boolean { + private startsNewMajorSection(chunk: Chunk): boolean { return chunk.section.level === 1 || chunk.section.level === 2; } @@ -94,10 +94,7 @@ export class GreedySplitter implements DocumentSplitter { * Size limit check to ensure chunks remain within embedding model constraints. * Essential for maintaining consistent embedding quality and avoiding truncation. */ - private wouldExceedMaxSize( - currentChunk: ContentChunk | null, - nextChunk: ContentChunk, - ): boolean { + private wouldExceedMaxSize(currentChunk: Chunk | null, nextChunk: Chunk): boolean { if (!currentChunk) { return false; } @@ -122,10 +119,7 @@ export class GreedySplitter implements DocumentSplitter { * - For siblings/unrelated sections, uses the common parent path * - If no common path exists, uses the root path ([]) */ - private mergeSectionInfo( - currentChunk: ContentChunk, - nextChunk: ContentChunk, - ): ContentChunk["section"] { + private mergeSectionInfo(currentChunk: Chunk, nextChunk: Chunk): Chunk["section"] { // Always use the lowest level const level = Math.min(currentChunk.section.level, nextChunk.section.level); diff --git a/src/splitter/JsonDocumentSplitter.ts b/src/splitter/JsonDocumentSplitter.ts index 6bbc21fe..c06d10b7 100644 --- a/src/splitter/JsonDocumentSplitter.ts +++ b/src/splitter/JsonDocumentSplitter.ts @@ -13,7 +13,7 @@ * 5. Let GreedySplitter handle size optimization */ -import type { ContentChunk, DocumentSplitter } from "./types"; +import type { Chunk, DocumentSplitter } from "./types"; type JsonValue = | string @@ -35,10 +35,10 @@ export class JsonDocumentSplitter implements DocumentSplitter { this.preserveFormatting = options.preserveFormatting ?? 
true; } - async splitText(content: string, _contentType?: string): Promise { + async splitText(content: string, _contentType?: string): Promise { try { const parsed: JsonValue = JSON.parse(content); - const chunks: ContentChunk[] = []; + const chunks: Chunk[] = []; // Process the JSON structure recursively, starting with root path this.processValue(parsed, ["root"], 1, 0, chunks, true); @@ -64,7 +64,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastItem: boolean, ): void { if (Array.isArray(value)) { @@ -81,7 +81,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastItem: boolean, ): void { const indent = this.getIndent(indentLevel); @@ -114,7 +114,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastItem: boolean, ): void { const indent = this.getIndent(indentLevel); @@ -157,7 +157,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastProperty: boolean, ): void { const indent = this.getIndent(indentLevel); @@ -189,7 +189,7 @@ export class JsonDocumentSplitter implements DocumentSplitter { path: string[], level: number, indentLevel: number, - chunks: ContentChunk[], + chunks: Chunk[], isLastItem: boolean, ): void { const indent = this.getIndent(indentLevel); diff --git a/src/splitter/SemanticMarkdownSplitter.ts b/src/splitter/SemanticMarkdownSplitter.ts index ddeb9de7..c53e85bb 100644 --- a/src/splitter/SemanticMarkdownSplitter.ts +++ b/src/splitter/SemanticMarkdownSplitter.ts @@ -11,7 +11,7 @@ import { ContentSplitterError, MinimumChunkSizeError } from "./errors"; import { CodeContentSplitter } from "./splitters/CodeContentSplitter"; import { TableContentSplitter } from "./splitters/TableContentSplitter"; import { TextContentSplitter } from "./splitters/TextContentSplitter"; -import type { ContentChunk, DocumentSplitter, SectionContentType } from "./types"; +import type { Chunk, DocumentSplitter, SectionContentType } from "./types"; /** * Represents a section of content within a document, @@ -101,7 +101,7 @@ export class SemanticMarkdownSplitter implements DocumentSplitter { /** * Main entry point for splitting markdown content */ - async splitText(markdown: string, _contentType?: string): Promise { + async splitText(markdown: string, _contentType?: string): Promise { // Note: JSON content is now handled by dedicated JsonDocumentSplitter in JsonPipeline // This splitter focuses on markdown, HTML, and plain text content @@ -219,10 +219,8 @@ export class SemanticMarkdownSplitter implements DocumentSplitter { /** * Step 2: Split section content into smaller chunks */ - private async splitSectionContent( - sections: DocumentSection[], - ): Promise { - const chunks: ContentChunk[] = []; + private async splitSectionContent(sections: DocumentSection[]): Promise { + const chunks: Chunk[] = []; for (const section of sections) { for (const content of section.content) { @@ -296,7 +294,7 @@ export class SemanticMarkdownSplitter implements DocumentSplitter { // Create chunks from split content chunks.push( ...splitContent.map( - (text): ContentChunk => ({ + (text): Chunk => ({ types: [content.type], content: text, section: { diff 
--git a/src/splitter/TextDocumentSplitter.ts b/src/splitter/TextDocumentSplitter.ts index 42748bca..f64b45ce 100644 --- a/src/splitter/TextDocumentSplitter.ts +++ b/src/splitter/TextDocumentSplitter.ts @@ -9,7 +9,7 @@ import { SPLITTER_MAX_CHUNK_SIZE } from "../utils"; import { TextContentSplitter } from "./splitters/TextContentSplitter"; -import type { ContentChunk, DocumentSplitter } from "./types"; +import type { Chunk, DocumentSplitter } from "./types"; /** * Configuration options for text document splitting @@ -39,7 +39,7 @@ export class TextDocumentSplitter implements DocumentSplitter { }); } - async splitText(content: string): Promise { + async splitText(content: string): Promise { if (!content.trim()) { return []; } diff --git a/src/splitter/treesitter/TreesitterSourceCodeSplitter.ts b/src/splitter/treesitter/TreesitterSourceCodeSplitter.ts index 52b38d4c..1d6a7b92 100644 --- a/src/splitter/treesitter/TreesitterSourceCodeSplitter.ts +++ b/src/splitter/treesitter/TreesitterSourceCodeSplitter.ts @@ -8,7 +8,7 @@ import { SPLITTER_MAX_CHUNK_SIZE } from "../../utils"; import { TextContentSplitter } from "../splitters/TextContentSplitter"; -import type { ContentChunk, DocumentSplitter } from "../types"; +import type { Chunk, DocumentSplitter } from "../types"; import { LanguageParserRegistry } from "./LanguageParserRegistry"; import type { CodeBoundary, LanguageParser } from "./parsers/types"; @@ -41,7 +41,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { }); } - async splitText(content: string, contentType?: string): Promise { + async splitText(content: string, contentType?: string): Promise { if (!content.trim()) { return []; } @@ -89,7 +89,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { /** * Helper method to fall back to TextContentSplitter and convert results to ContentChunk[] */ - private async fallbackToTextSplitter(content: string): Promise { + private async fallbackToTextSplitter(content: string): Promise { const textChunks = await this.textContentSplitter.split(content); return textChunks.map((chunk) => ({ types: ["code"], @@ -173,7 +173,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { content: string, path: string[], level: number, - ): Promise { + ): Promise { // Preserve whitespace-only content if it fits within chunk size (for perfect reconstruction) // Only skip if content is completely empty if (content.length === 0) { @@ -223,7 +223,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { boundaries: CodeBoundary[], content: string, _contentType?: string, - ): Promise { + ): Promise { const lines = content.split("\n"); const totalLines = lines.length; @@ -299,7 +299,7 @@ export class TreesitterSourceCodeSplitter implements DocumentSplitter { } // Step 4: Convert segments directly to chunks (whitespace retained verbatim) - const chunks: ContentChunk[] = []; + const chunks: Chunk[] = []; // Ensure only ONE structural chunk is emitted per structural boundary. 
const structuralBoundaryFirstChunk = new Set(); diff --git a/src/splitter/types.ts b/src/splitter/types.ts index 40086eb7..0d48ccc2 100644 --- a/src/splitter/types.ts +++ b/src/splitter/types.ts @@ -6,7 +6,7 @@ export type SectionContentType = "text" | "code" | "table" | "heading" | "struct /** * Final output chunk after processing and size-based splitting */ -export interface ContentChunk { +export interface Chunk { types: SectionContentType[]; content: string; section: { @@ -19,5 +19,5 @@ export interface ContentChunk { * Interface for a splitter that processes markdown content into chunks */ export interface DocumentSplitter { - splitText(markdown: string, contentType?: string): Promise; + splitText(markdown: string, contentType?: string): Promise; } diff --git a/src/store/DocumentManagementService.test.ts b/src/store/DocumentManagementService.test.ts index 343b32d7..881be474 100644 --- a/src/store/DocumentManagementService.test.ts +++ b/src/store/DocumentManagementService.test.ts @@ -1,12 +1,7 @@ import path from "node:path"; -import { Document } from "@langchain/core/documents"; import { createFsFromVolume, vol } from "memfs"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { - LibraryNotFoundInStoreError, - StoreError, - VersionNotFoundInStoreError, -} from "./errors"; +import { LibraryNotFoundInStoreError, VersionNotFoundInStoreError } from "./errors"; vi.mock("node:fs", () => ({ default: createFsFromVolume(vol), @@ -38,7 +33,7 @@ const mockStore = { checkDocumentExists: vi.fn(), queryLibraryVersions: vi.fn().mockResolvedValue(new Map()), addDocuments: vi.fn(), - deleteDocuments: vi.fn(), + deletePages: vi.fn(), // Status tracking methods updateVersionStatus: vi.fn(), updateVersionProgress: vi.fn(), @@ -270,124 +265,22 @@ describe("DocumentManagementService", () => { expect(mockStore.checkDocumentExists).toHaveBeenCalledWith("test-lib", "1.0.0"); }); - describe("document processing", () => { - it("should add and search documents with basic metadata", async () => { - const library = "test-lib"; - const version = "1.0.0"; - const validDocument = new Document({ - pageContent: "Test document content about testing", - metadata: { - url: "http://example.com", - title: "Test Doc", - }, - }); - - const documentNoUrl = new Document({ - pageContent: "Test document without URL", - metadata: { - title: "Test Doc", - }, - }); - - // Should fail when URL is missing - await expect( - docService.addDocument(library, version, documentNoUrl), - ).rejects.toThrow(StoreError); - - await expect( - docService.addDocument(library, version, documentNoUrl), - ).rejects.toHaveProperty("message", "Document metadata must include a valid URL"); - - // Should succeed with valid URL - mockRetriever.search.mockResolvedValue(["Mocked search result"]); - - await docService.addDocument(library, version, validDocument); - - const results = await docService.searchStore(library, version, "testing"); - expect(mockStore.addDocuments).toHaveBeenCalledWith( - // Fix: Use mockStoreInstance - library, - version, - expect.arrayContaining([ - expect.objectContaining({ pageContent: validDocument.pageContent }), - ]), - ); - expect(results).toEqual(["Mocked search result"]); // Expect mocked result - }); - - it("should preserve semantic metadata when processing markdown documents", async () => { - const library = "test-lib"; - const version = "1.0.0"; - const document = new Document({ - pageContent: "# Chapter 1\nTest content\n## Section 1.1\nMore testing content", - metadata: { - url: 
"http://example.com/docs", - title: "Root Doc", - }, - }); - - // Mock the search result to match what would actually be stored after processing - mockRetriever.search.mockResolvedValue(["Mocked search result"]); - - await docService.addDocument(library, version, document); - - // Verify the documents were stored with semantic metadata - expect(mockStore.addDocuments).toHaveBeenCalledWith( - library, - version, - expect.arrayContaining([ - expect.objectContaining({ - metadata: expect.objectContaining({ - level: 0, - path: [], - }), - }), - ]), - ); - - // Verify search results preserve metadata - const results = await docService.searchStore(library, version, "testing"); - expect(results).toEqual(["Mocked search result"]); - }); - - it("should handle unsupported content types gracefully", async () => { - const library = "test-lib"; - const version = "1.0.0"; - const binaryDocument = new Document({ - pageContent: "binary content with null bytes\0", - metadata: { - url: "http://example.com/image.png", - title: "Binary Image", - mimeType: "image/png", - }, - }); - - // Should not throw an error, just log a warning and return early - await expect( - docService.addDocument(library, version, binaryDocument), - ).resolves.toBeUndefined(); - - // Verify that no documents were added to the store - expect(mockStore.addDocuments).not.toHaveBeenCalled(); - }); - }); - it("should remove all documents for a specific library and version", async () => { const library = "test-lib"; const version = "1.0.0"; await docService.removeAllDocuments(library, version); - expect(mockStore.deleteDocuments).toHaveBeenCalledWith(library, version); // Fix: Use mockStoreInstance + expect(mockStore.deletePages).toHaveBeenCalledWith(library, version); // Fix: Use mockStoreInstance }); it("should handle removing documents with null/undefined/empty version", async () => { const library = "test-lib"; await docService.removeAllDocuments(library, null); - expect(mockStore.deleteDocuments).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.deletePages).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance await docService.removeAllDocuments(library, undefined); - expect(mockStore.deleteDocuments).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.deletePages).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance await docService.removeAllDocuments(library, ""); - expect(mockStore.deleteDocuments).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.deletePages).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance }); describe("listVersions", () => { @@ -768,46 +661,21 @@ describe("DocumentManagementService", () => { // Tests for handling optional version parameter (null/undefined/"") describe("Optional Version Handling", () => { const library = "opt-lib"; - const doc = new Document({ - pageContent: "Optional version test", - metadata: { url: "http://opt.com" }, - }); const query = "optional"; it("exists should normalize version to empty string", async () => { await docService.exists(library, null); - expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); await docService.exists(library, undefined); - expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance + expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); await 
docService.exists(library, ""); - expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); // Fix: Use mockStoreInstance - }); - - it("addDocument should normalize version to empty string", async () => { - await docService.addDocument(library, null, doc); - expect(mockStore.addDocuments).toHaveBeenCalledWith( - library, - "", - expect.any(Array), - ); // Fix: Use mockStoreInstance - await docService.addDocument(library, undefined, doc); - expect(mockStore.addDocuments).toHaveBeenCalledWith( - library, - "", - expect.any(Array), - ); // Fix: Use mockStoreInstance - await docService.addDocument(library, "", doc); - expect(mockStore.addDocuments).toHaveBeenCalledWith( - library, - "", - expect.any(Array), - ); // Fix: Use mockStoreInstance + expect(mockStore.checkDocumentExists).toHaveBeenCalledWith(library, ""); }); it("searchStore should normalize version to empty string", async () => { // Call without explicit limit, should use default limit of 5 await docService.searchStore(library, null, query); - expect(mockRetriever.search).toHaveBeenCalledWith(library, "", query, 5); // Expect default limit 5 + expect(mockRetriever.search).toHaveBeenCalledWith(library, "", query, 5); // Call with explicit limit await docService.searchStore(library, undefined, query, 7); diff --git a/src/store/DocumentManagementService.ts b/src/store/DocumentManagementService.ts index 7b0df7f4..7f660cd0 100644 --- a/src/store/DocumentManagementService.ts +++ b/src/store/DocumentManagementService.ts @@ -1,5 +1,4 @@ import path from "node:path"; -import type { Document } from "@langchain/core/documents"; import Fuse from "fuse.js"; import semver from "semver"; import { @@ -7,9 +6,8 @@ import { PipelineFactory, } from "../scraper/pipelines/PipelineFactory"; import type { ContentPipeline } from "../scraper/pipelines/types"; -import type { ScraperOptions } from "../scraper/types"; -import { ScrapeMode } from "../scraper/types"; -import type { ContentChunk } from "../splitter/types"; +import type { ScrapeResult, ScraperOptions } from "../scraper/types"; +import type { Chunk } from "../splitter/types"; import { analytics, extractHostname, TelemetryEvent } from "../telemetry"; import { logger } from "../utils/logger"; import { DocumentRetrieverService } from "./DocumentRetrieverService"; @@ -53,10 +51,11 @@ export class DocumentManagementService { embeddingConfig?: EmbeddingModelConfig | null, pipelineConfig?: PipelineConfiguration, ) { - const dbDir = storePath; - const dbPath = path.join(dbDir, "documents.db"); + // Handle special :memory: case for in-memory databases (primarily for testing) + const dbPath = + storePath === ":memory:" ? ":memory:" : path.join(storePath, "documents.db"); - logger.debug(`Using database directory: ${dbDir}`); + logger.debug(`Using database path: ${dbPath}`); // Directory creation is handled by the centralized path resolution @@ -335,10 +334,31 @@ export class DocumentManagementService { logger.info( `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`, ); - const count = await this.store.deleteDocuments(library, normalizedVersion); + const count = await this.store.deletePages(library, normalizedVersion); logger.info(`🗑️ Deleted ${count} documents`); } + /** + * Deletes a page and all its associated document chunks. + * This is used during refresh operations when a page returns 404 Not Found. 
+ */ + async deletePage(pageId: number): Promise { + logger.debug(`Deleting page ID: ${pageId}`); + await this.store.deletePage(pageId); + } + + /** + * Retrieves all pages for a specific version ID with their metadata. + * Used for refresh operations to get existing pages with their ETags and depths. + */ + async getPagesByVersionId( + versionId: number, + ): Promise< + Array<{ id: number; url: string; etag: string | null; depth: number | null }> + > { + return this.store.getPagesByVersionId(versionId); + } + /** * Completely removes a library version and all associated documents. * Also removes the library if no other versions remain. @@ -347,18 +367,16 @@ export class DocumentManagementService { */ async removeVersion(library: string, version?: string | null): Promise { const normalizedVersion = this.normalizeVersion(version); - logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`); + logger.debug(`Removing version: ${library}@${normalizedVersion || "[no version]"}`); const result = await this.store.removeVersion(library, normalizedVersion, true); - logger.info( - `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`, - ); + logger.info(`🗑️ Removed ${result.documentsDeleted} documents`); if (result.versionDeleted && result.libraryDeleted) { - logger.info(`✅ Completely removed library ${library} (was last version)`); + logger.info(`🗑️ Completely removed library ${library} (was last version)`); } else if (result.versionDeleted) { - logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`); + logger.info(`🗑️ Removed version ${library}@${normalizedVersion || "[no version]"}`); } else { logger.warn( `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`, @@ -367,108 +385,70 @@ export class DocumentManagementService { } /** - * Adds a document to the store, splitting it into smaller chunks for better search results. - * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting. - * Preserves hierarchical structure of documents and distinguishes between text and code segments. - * If version is omitted, the document is added without a specific version. + * Adds pre-processed content directly to the store. + * This method is used when content has already been processed by a pipeline, + * avoiding redundant processing. Used primarily by the scraping pipeline. 
+ * + * @param library Library name + * @param version Version string (null/undefined for unversioned) + * @param processed Pre-processed content with chunks already created + * @param pageId Optional page ID for refresh operations */ - async addDocument( + async addScrapeResult( library: string, version: string | null | undefined, - document: Document, + depth: number, + result: ScrapeResult, ): Promise { const processingStart = performance.now(); const normalizedVersion = this.normalizeVersion(version); - const url = document.metadata.url as string; - - if (!url || typeof url !== "string" || !url.trim()) { - throw new StoreError("Document metadata must include a valid URL"); + const { url, title, chunks, contentType } = result; + if (!url) { + throw new StoreError("Processed content metadata must include a valid URL"); } - logger.info(`📚 Adding document: ${document.metadata.title}`); + logger.info(`📚 Adding processed content: ${title || url}`); - if (!document.pageContent.trim()) { - throw new Error("Document content cannot be empty"); + if (chunks.length === 0) { + logger.warn(`⚠️ No chunks in processed content for ${url}. Skipping.`); + return; } - const contentType = document.metadata.mimeType as string | undefined; - try { - // Create a mock RawContent for pipeline selection - const rawContent = { - source: url, - content: document.pageContent, - mimeType: contentType || "text/plain", - }; - - // Find appropriate pipeline for content type - const pipeline = this.pipelines.find((p) => p.canProcess(rawContent)); - - if (!pipeline) { - logger.warn( - `⚠️ Unsupported content type "${rawContent.mimeType}" for document ${url}. Skipping processing.`, - ); - return; - } - - // Debug logging for pipeline selection - logger.debug( - `Selected ${pipeline.constructor.name} for content type "${rawContent.mimeType}" (${url})`, - ); - - // Use content-type-specific pipeline for processing and splitting - // Create minimal scraper options for processing - const scraperOptions = { - url: url, - library: library, - version: normalizedVersion, - scrapeMode: ScrapeMode.Fetch, - ignoreErrors: false, - maxConcurrency: 1, - }; - - const processed = await pipeline.process(rawContent, scraperOptions); - const chunks = processed.chunks; - - // Convert semantic chunks to documents - const splitDocs = chunks.map((chunk: ContentChunk) => ({ - pageContent: chunk.content, - metadata: { - ...document.metadata, - level: chunk.section.level, - path: chunk.section.path, - }, - })); - logger.info(`✂️ Split document into ${splitDocs.length} chunks`); + logger.info(`✂️ Storing ${chunks.length} pre-split chunks`); // Add split documents to store - await this.store.addDocuments(library, normalizedVersion, splitDocs); + await this.store.addDocuments(library, normalizedVersion, depth, result); // Track successful document processing const processingTime = performance.now() - processingStart; + const totalContentSize = chunks.reduce( + (sum: number, chunk: Chunk) => sum + chunk.content.length, + 0, + ); + analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, { // Content characteristics (privacy-safe) - mimeType: contentType || "unknown", - contentSizeBytes: document.pageContent.length, + mimeType: contentType, + contentSizeBytes: totalContentSize, // Processing metrics processingTimeMs: Math.round(processingTime), - chunksCreated: splitDocs.length, + chunksCreated: chunks.length, // Document characteristics - hasTitle: !!document.metadata.title, - hasDescription: !!document.metadata.description, + hasTitle: !!title, urlDomain: 
extractHostname(url), - depth: document.metadata.depth, + depth, // Library context library, libraryVersion: normalizedVersion || null, // Processing efficiency - avgChunkSizeBytes: Math.round(document.pageContent.length / splitDocs.length), + avgChunkSizeBytes: Math.round(totalContentSize / chunks.length), processingSpeedKbPerSec: Math.round( - document.pageContent.length / 1024 / (processingTime / 1000), + totalContentSize / 1024 / (processingTime / 1000), ), }); } catch (error) { @@ -477,12 +457,15 @@ export class DocumentManagementService { if (error instanceof Error) { analytics.captureException(error, { - mimeType: contentType || "unknown", - contentSizeBytes: document.pageContent.length, + mimeType: contentType, + contentSizeBytes: chunks.reduce( + (sum: number, chunk: Chunk) => sum + chunk.content.length, + 0, + ), processingTimeMs: Math.round(processingTime), library, libraryVersion: normalizedVersion || null, - context: "document_processing", + context: "processed_content_storage", component: DocumentManagementService.constructor.name, }); } diff --git a/src/store/DocumentRetrieverService.test.ts b/src/store/DocumentRetrieverService.test.ts index d01492c6..8b94e7e0 100644 --- a/src/store/DocumentRetrieverService.test.ts +++ b/src/store/DocumentRetrieverService.test.ts @@ -1,7 +1,7 @@ -import { Document } from "@langchain/core/documents"; import { beforeEach, describe, expect, it, vi } from "vitest"; import { DocumentRetrieverService } from "./DocumentRetrieverService"; import { DocumentStore } from "./DocumentStore"; +import type { DbChunkRank, DbPageChunk } from "./types"; vi.mock("./DocumentStore"); vi.mock("../utils/logger"); @@ -27,21 +27,26 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; // Two initial hits from the same URL, with overlapping context - const initialResult1 = new Document({ + const initialResult1 = { id: "doc1", - pageContent: "Chunk A", - metadata: { url: "url", score: 0.9 }, - }); - const initialResult2 = new Document({ + content: "Chunk A", + url: "url", + score: 0.9, + metadata: {}, + } as DbPageChunk & DbChunkRank; + const initialResult2 = { id: "doc3", - pageContent: "Chunk C", - metadata: { url: "url", score: 0.8 }, - }); - const doc2 = new Document({ + content: "Chunk C", + url: "url", + score: 0.8, + metadata: {}, + } as DbPageChunk & DbChunkRank; + const doc2 = { id: "doc2", - pageContent: "Chunk B", - metadata: { url: "url" }, - }); + content: "Chunk B", + url: "url", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([ initialResult1, @@ -85,21 +90,25 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const library = "lib"; const version = "1.0.0"; const query = "test"; - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "Main chunk", - metadata: { url: "url", score: 0.7 }, - }); - const parent = new Document({ + content: "Main chunk", + score: 0.7, + url: "url", + metadata: {}, + } as DbPageChunk & DbChunkRank; + const parent = { id: "parent1", - pageContent: "Parent", - metadata: { url: "url" }, - }); - const child = new Document({ + content: "Parent", + url: "url", + metadata: {}, + } as DbPageChunk & DbChunkRank; + const child = { id: "child1", - pageContent: "Child", - metadata: { url: "url" }, - }); + content: "Child", + url: "url", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, 
"findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(parent); @@ -130,16 +139,20 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const library = "lib"; const version = "1.0.0"; const query = "test"; - const docA = new Document({ + const docA = { id: "a1", - pageContent: "A1", - metadata: { url: "urlA", score: 0.8 }, - }); - const docB = new Document({ + content: "A1", + url: "urlA", + score: 0.8, + metadata: {}, + } as DbPageChunk & DbChunkRank; + const docB = { id: "b1", - pageContent: "B1", - metadata: { url: "urlB", score: 0.9 }, - }); + content: "B1", + url: "urlB", + score: 0.9, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([docA, docB]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -174,11 +187,13 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const library = "lib"; const version = "1.0.0"; const query = "test"; - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "Main chunk", - metadata: { url: "url", score: 0.5 }, - }); + content: "Main chunk", + url: "url", + score: 0.5, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -210,11 +225,13 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; const limit = 3; - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "Main chunk", - metadata: { url: "url", score: 0.5 }, - }); + content: "Main chunk", + url: "url", + score: 0.5, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -247,11 +264,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const mimeType = "text/html"; // Create a document with mimeType in metadata - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "HTML content", - metadata: { url: "https://example.com", score: 0.9, mimeType }, - }); + content: "HTML content", + url: "https://example.com", + score: 0.9, + content_type: mimeType, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -277,11 +297,13 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Create a document without mimeType in metadata - const initialResult = new Document({ + const initialResult = { id: "doc1", - pageContent: "Plain content", - metadata: { url: "https://example.com", score: 0.9 }, - }); + content: "Plain content", + url: "https://example.com", + score: 0.9, + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([initialResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -308,27 +330,27 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Child chunk with path ["Chapter 1", "Section 1.1"] - const childResult = new Document({ + const childResult = { id: "child1", - pageContent: "Child content", + content: "Child content", + 
url: "https://example.com", + score: 0.8, metadata: { - url: "https://example.com", - score: 0.8, path: ["Chapter 1", "Section 1.1"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; // Parent chunk with path ["Chapter 1"] - const parentChunk = new Document({ + const parentChunk = { id: "parent1", - pageContent: "Parent content", + content: "Parent content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1"], level: 1, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([childResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(parentChunk); @@ -363,38 +385,38 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Main result chunk - const mainResult = new Document({ + const mainResult = { id: "main1", - pageContent: "Main content", + content: "Main content", + url: "https://example.com", + score: 0.9, metadata: { - url: "https://example.com", - score: 0.9, path: ["Chapter 1", "Section 1.2"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; // Preceding sibling with same path level - const precedingSibling = new Document({ + const precedingSibling = { id: "preceding1", - pageContent: "Preceding content", + content: "Preceding content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; // Subsequent sibling with same path level - const subsequentSibling = new Document({ + const subsequentSibling = { id: "subsequent1", - pageContent: "Subsequent content", + content: "Subsequent content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.3"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([mainResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -441,37 +463,37 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Parent result chunk - const parentResult = new Document({ + const parentResult = { id: "parent1", - pageContent: "Parent section", + content: "Parent section", + url: "https://example.com", + score: 0.7, metadata: { - url: "https://example.com", - score: 0.7, path: ["Chapter 1"], level: 1, }, - }); + } as DbPageChunk & DbChunkRank; // Child chunks at deeper level - const child1 = new Document({ + const child1 = { id: "child1", - pageContent: "First subsection", + content: "First subsection", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; - const child2 = new Document({ + const child2 = { id: "child2", - pageContent: "Second subsection", + content: "Second subsection", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.2"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([parentResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -508,27 +530,27 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Multiple chunks from same document/URL, returned out of sort_order - const chunk3 = new Document({ + const chunk3 = { id: "chunk3", - pageContent: "Third chunk", + content: "Third chunk", + url: "https://example.com", + score: 
0.6, metadata: { - url: "https://example.com", - score: 0.6, path: ["Section C"], level: 1, }, - }); + } as DbPageChunk & DbChunkRank; - const chunk1 = new Document({ + const chunk1 = { id: "chunk1", - pageContent: "First chunk", + content: "First chunk", + url: "https://example.com", + score: 0.8, metadata: { - url: "https://example.com", - score: 0.8, path: ["Section A"], level: 1, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([chunk3, chunk1]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -558,60 +580,60 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const query = "test"; // Main search result - a subsection - const mainResult = new Document({ + const mainResult = { id: "main1", - pageContent: "Key subsection content", + content: "Key subsection content", + url: "https://example.com", + score: 0.9, metadata: { - url: "https://example.com", - score: 0.9, path: ["Guide", "Installation", "Setup"], level: 3, }, - }); + } as DbPageChunk & DbChunkRank; // Parent at level 2 - const parent = new Document({ + const parent = { id: "parent1", - pageContent: "Installation overview", + content: "Installation overview", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Guide", "Installation"], level: 2, }, - }); + } as DbPageChunk & DbChunkRank; // Preceding sibling at same level - const precedingSibling = new Document({ + const precedingSibling = { id: "preceding1", - pageContent: "Prerequisites section", + content: "Prerequisites section", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Guide", "Installation", "Prerequisites"], level: 3, }, - }); + } as DbPageChunk & DbChunkRank; // Child at deeper level - const child = new Document({ + const child = { id: "child1", - pageContent: "Detailed setup steps", + content: "Detailed setup steps", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Guide", "Installation", "Setup", "Steps"], level: 4, }, - }); + } as DbPageChunk & DbChunkRank; // Subsequent sibling - const subsequentSibling = new Document({ + const subsequentSibling = { id: "subsequent1", - pageContent: "Configuration section", + content: "Configuration section", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Guide", "Installation", "Configuration"], level: 3, }, - }); + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([mainResult]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(parent); @@ -652,15 +674,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; - const markdownChunk = new Document({ + const markdownChunk = { id: "md1", - pageContent: "# Heading\n\nSome content", - metadata: { - url: "https://example.com/doc.md", - score: 0.9, - mimeType: "text/markdown", - }, - }); + content: "# Heading\n\nSome content", + url: "https://example.com/doc.md", + score: 0.9, + content_type: "text/markdown", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([markdownChunk]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -685,15 +706,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; - const codeChunk = new Document({ + const codeChunk = { id: "ts1", - pageContent: "function test() {\n 
return 'hello';\n}", - metadata: { - url: "https://example.com/code.ts", - score: 0.9, - mimeType: "text/x-typescript", - }, - }); + content: "function test() {\n return 'hello';\n}", + url: "https://example.com/code.ts", + score: 0.9, + content_type: "text/x-typescript", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([codeChunk]); // Mock the hierarchical strategy's fallback behavior since we don't have full hierarchy implementation @@ -717,15 +737,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; - const jsonChunk = new Document({ + const jsonChunk = { id: "json1", - pageContent: '{"key": "value"}', - metadata: { - url: "https://example.com/config.json", - score: 0.9, - mimeType: "application/json", - }, - }); + content: '{"key": "value"}', + url: "https://example.com/config.json", + score: 0.9, + content_type: "application/json", + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([jsonChunk]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); @@ -748,15 +767,14 @@ describe("DocumentRetrieverService (consolidated logic)", () => { const version = "1.0.0"; const query = "test"; - const unknownChunk = new Document({ + const unknownChunk = { id: "unknown1", - pageContent: "Some content", - metadata: { - url: "https://example.com/unknown", - score: 0.9, - // No mimeType specified - }, - }); + content: "Some content", + url: "https://example.com/unknown", + score: 0.9, + // No mimeType specified + metadata: {}, + } as DbPageChunk & DbChunkRank; vi.spyOn(mockDocumentStore, "findByContent").mockResolvedValue([unknownChunk]); vi.spyOn(mockDocumentStore, "findParentChunk").mockResolvedValue(null); diff --git a/src/store/DocumentRetrieverService.ts b/src/store/DocumentRetrieverService.ts index f0633a34..bfd66cab 100644 --- a/src/store/DocumentRetrieverService.ts +++ b/src/store/DocumentRetrieverService.ts @@ -1,7 +1,6 @@ -import type { Document } from "@langchain/core/documents"; import { createContentAssemblyStrategy } from "./assembly/ContentAssemblyStrategyFactory"; import type { DocumentStore } from "./DocumentStore"; -import type { StoreSearchResult } from "./types"; +import type { DbChunkRank, DbPageChunk, StoreSearchResult } from "./types"; export class DocumentRetrieverService { private documentStore: DocumentStore; @@ -59,11 +58,13 @@ export class DocumentRetrieverService { /** * Groups search results by URL. */ - private groupResultsByUrl(results: Document[]): Map { - const resultsByUrl = new Map(); + private groupResultsByUrl( + results: (DbPageChunk & DbChunkRank)[], + ): Map { + const resultsByUrl = new Map(); for (const result of results) { - const url = result.metadata.url as string; + const url = result.url; if (!resultsByUrl.has(url)) { resultsByUrl.set(url, []); } @@ -83,18 +84,14 @@ export class DocumentRetrieverService { library: string, version: string, url: string, - initialChunks: Document[], + initialChunks: (DbPageChunk & DbChunkRank)[], ): Promise { - // Extract mimeType from the first document's metadata - const mimeType = - initialChunks.length > 0 - ? (initialChunks[0].metadata.mimeType as string | undefined) - : undefined; + // Extract mimeType from the first document's content_type (page-level field) + // Convert null to undefined for consistency + const mimeType = initialChunks.length > 0 ? 
initialChunks[0].content_type : undefined; // Find the maximum score from the initial results - const maxScore = Math.max( - ...initialChunks.map((chunk) => chunk.metadata.score as number), - ); + const maxScore = Math.max(...initialChunks.map((chunk) => chunk.score)); // Create appropriate assembly strategy based on content type const strategy = createContentAssemblyStrategy(mimeType); diff --git a/src/store/DocumentStore.test.ts b/src/store/DocumentStore.test.ts index be1f6de6..716cba07 100644 --- a/src/store/DocumentStore.test.ts +++ b/src/store/DocumentStore.test.ts @@ -1,5 +1,6 @@ -import type { Document } from "@langchain/core/documents"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { ScrapeResult } from "../scraper/types"; +import type { Chunk } from "../splitter/types"; import { DocumentStore } from "./DocumentStore"; import { EmbeddingConfig } from "./embeddings/EmbeddingConfig"; import { VersionStatus } from "./types"; @@ -62,6 +63,41 @@ vi.mock("./embeddings/EmbeddingFactory", async (importOriginal) => { }; }); +/** + * Helper function to create minimal ScrapeResult for testing. + * Converts simplified test data to the ScrapeResult format expected by addDocuments. + */ +function createScrapeResult( + title: string, + url: string, + content: string, + path: string[] = [], + options?: { + etag?: string | null; + lastModified?: string | null; + }, +): ScrapeResult { + const chunks: Chunk[] = [ + { + types: ["text"], + content, + section: { level: 0, path }, + }, + ]; + + return { + url, + title, + contentType: "text/html", + textContent: content, + links: [], + errors: [], + chunks, + etag: options?.etag, + lastModified: options?.lastModified, + } satisfies ScrapeResult; +} + /** * Tests for DocumentStore with embeddings enabled * Uses explicit embedding configuration and tests hybrid search functionality @@ -88,26 +124,29 @@ describe("DocumentStore - With Embeddings", () => { describe("Document Storage and Retrieval", () => { it("should store and retrieve documents with proper metadata", async () => { - const docs: Document[] = [ - { - pageContent: "JavaScript programming tutorial with examples", - metadata: { - title: "JS Tutorial", - url: "https://example.com/js-tutorial", - path: ["programming", "javascript"], - }, - }, - { - pageContent: "Python data science guide with pandas", - metadata: { - title: "Python DS", - url: "https://example.com/python-ds", - path: ["programming", "python"], - }, - }, - ]; - - await store.addDocuments("testlib", "1.0.0", docs); + // Add two pages separately + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + "JS Tutorial", + "https://example.com/js-tutorial", + "JavaScript programming tutorial with examples", + ["programming", "javascript"], + ), + ); + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + "Python DS", + "https://example.com/python-ds", + "Python data science guide with pandas", + ["programming", "python"], + ), + ); // Verify documents were stored expect(await store.checkDocumentExists("testlib", "1.0.0")).toBe(true); @@ -136,47 +175,48 @@ describe("DocumentStore - With Embeddings", () => { }); it("should handle document deletion correctly", async () => { - const docs: Document[] = [ - { - pageContent: "Temporary document for deletion test", - metadata: { - title: "Temp Doc", - url: "https://example.com/temp", - path: ["temp"], - }, - }, - ]; - - await store.addDocuments("templib", "1.0.0", docs); + await store.addDocuments( + 
"templib", + "1.0.0", + 1, + createScrapeResult( + "Temp Doc", + "https://example.com/temp", + "Temporary document for deletion test", + ["temp"], + ), + ); expect(await store.checkDocumentExists("templib", "1.0.0")).toBe(true); - const deletedCount = await store.deleteDocuments("templib", "1.0.0"); + const deletedCount = await store.deletePages("templib", "1.0.0"); expect(deletedCount).toBe(1); expect(await store.checkDocumentExists("templib", "1.0.0")).toBe(false); }); it("should completely remove a version including pages and documents", async () => { - const docs: Document[] = [ - { - pageContent: "First document for removal test", - metadata: { - title: "Doc 1", - url: "https://example.com/doc1", - path: ["docs"], - }, - }, - { - pageContent: "Second document for removal test", - metadata: { - title: "Doc 2", - url: "https://example.com/doc2", - path: ["docs"], - }, - }, - ]; - - // Add documents and verify they exist - await store.addDocuments("removelib", "1.0.0", docs); + // Add two pages + await store.addDocuments( + "removelib", + "1.0.0", + 1, + createScrapeResult( + "Doc 1", + "https://example.com/doc1", + "First document for removal test", + ["docs"], + ), + ); + await store.addDocuments( + "removelib", + "1.0.0", + 1, + createScrapeResult( + "Doc 2", + "https://example.com/doc2", + "Second document for removal test", + ["docs"], + ), + ); expect(await store.checkDocumentExists("removelib", "1.0.0")).toBe(true); // Remove the version @@ -192,31 +232,23 @@ describe("DocumentStore - With Embeddings", () => { }); it("should remove version but keep library when other versions exist", async () => { - const v1Docs: Document[] = [ - { - pageContent: "Version 1 document", - metadata: { - title: "V1 Doc", - url: "https://example.com/v1", - path: ["v1"], - }, - }, - ]; - - const v2Docs: Document[] = [ - { - pageContent: "Version 2 document", - metadata: { - title: "V2 Doc", - url: "https://example.com/v2", - path: ["v2"], - }, - }, - ]; - // Add two versions - await store.addDocuments("multilib", "1.0.0", v1Docs); - await store.addDocuments("multilib", "2.0.0", v2Docs); + await store.addDocuments( + "multilib", + "1.0.0", + 1, + createScrapeResult("V1 Doc", "https://example.com/v1", "Version 1 document", [ + "v1", + ]), + ); + await store.addDocuments( + "multilib", + "2.0.0", + 1, + createScrapeResult("V2 Doc", "https://example.com/v2", "Version 2 document", [ + "v2", + ]), + ); // Remove only version 1.0.0 const result = await store.removeVersion("multilib", "1.0.0", true); @@ -232,30 +264,28 @@ describe("DocumentStore - With Embeddings", () => { }); it("should handle multiple versions of the same library", async () => { - const v1Docs: Document[] = [ - { - pageContent: "Version 1.0 feature documentation", - metadata: { - title: "V1 Features", - url: "https://example.com/v1", - path: ["features"], - }, - }, - ]; - - const v2Docs: Document[] = [ - { - pageContent: "Version 2.0 feature documentation with new capabilities", - metadata: { - title: "V2 Features", - url: "https://example.com/v2", - path: ["features"], - }, - }, - ]; - - await store.addDocuments("versionlib", "1.0.0", v1Docs); - await store.addDocuments("versionlib", "2.0.0", v2Docs); + await store.addDocuments( + "versionlib", + "1.0.0", + 1, + createScrapeResult( + "V1 Features", + "https://example.com/v1", + "Version 1.0 feature documentation", + ["features"], + ), + ); + await store.addDocuments( + "versionlib", + "2.0.0", + 1, + createScrapeResult( + "V2 Features", + "https://example.com/v2", + "Version 2.0 feature 
documentation with new capabilities", + ["features"], + ), + ); expect(await store.checkDocumentExists("versionlib", "1.0.0")).toBe(true); expect(await store.checkDocumentExists("versionlib", "2.0.0")).toBe(true); @@ -264,41 +294,91 @@ describe("DocumentStore - With Embeddings", () => { expect(versions).toContain("1.0.0"); expect(versions).toContain("2.0.0"); }); + + it("should store and retrieve etag and lastModified metadata", async () => { + const testEtag = '"abc123-def456"'; + const testLastModified = "2023-12-01T10:30:00Z"; + + await store.addDocuments( + "etagtest", + "1.0.0", + 1, + createScrapeResult( + "ETag Test Doc", + "https://example.com/etag-test", + "Test document with etag and lastModified", + ["test"], + { etag: testEtag, lastModified: testLastModified }, + ), + ); + + // Query the database directly to verify the etag and last_modified are stored + // @ts-expect-error Accessing private property for testing + const db = store.db; + const pageResult = db + .prepare(` + SELECT p.etag, p.last_modified + FROM pages p + JOIN versions v ON p.version_id = v.id + JOIN libraries l ON v.library_id = l.id + WHERE l.name = ? AND COALESCE(v.name, '') = ? AND p.url = ? + `) + .get("etagtest", "1.0.0", "https://example.com/etag-test") as + | { + etag: string | null; + last_modified: string | null; + } + | undefined; + + expect(pageResult).toBeDefined(); + expect(pageResult?.etag).toBe(testEtag); + expect(pageResult?.last_modified).toBe(testLastModified); + + // Also verify we can retrieve the document and it contains the metadata + const results = await store.findByContent("etagtest", "1.0.0", "etag", 10); + expect(results.length).toBeGreaterThan(0); + + const doc = results[0]; + expect(doc.url).toBe("https://example.com/etag-test"); + }); }); describe("Hybrid Search with Embeddings", () => { beforeEach(async () => { // Set up test documents with known semantic relationships for ranking tests - const docs: Document[] = [ - { - pageContent: "JavaScript programming tutorial with code examples and functions", - metadata: { - title: "JavaScript Programming Guide", - url: "https://example.com/js-guide", - path: ["programming", "javascript"], - }, - }, - { - pageContent: - "Advanced JavaScript frameworks like React and Vue for building applications", - metadata: { - title: "JavaScript Frameworks", - url: "https://example.com/js-frameworks", - path: ["programming", "javascript", "frameworks"], - }, - }, - { - pageContent: - "Python programming language tutorial for data science and machine learning", - metadata: { - title: "Python Programming", - url: "https://example.com/python-guide", - path: ["programming", "python"], - }, - }, - ]; - - await store.addDocuments("searchtest", "1.0.0", docs); + await store.addDocuments( + "searchtest", + "1.0.0", + 1, + createScrapeResult( + "JavaScript Programming Guide", + "https://example.com/js-guide", + "JavaScript programming tutorial with code examples and functions", + ["programming", "javascript"], + ), + ); + await store.addDocuments( + "searchtest", + "1.0.0", + 1, + createScrapeResult( + "JavaScript Frameworks", + "https://example.com/js-frameworks", + "Advanced JavaScript frameworks like React and Vue for building applications", + ["programming", "javascript", "frameworks"], + ), + ); + await store.addDocuments( + "searchtest", + "1.0.0", + 1, + createScrapeResult( + "Python Programming", + "https://example.com/python-guide", + "Python programming language tutorial for data science and machine learning", + ["programming", "python"], + ), + ); }); 
it("should perform hybrid search combining vector and FTS", async () => { @@ -313,31 +393,28 @@ describe("DocumentStore - With Embeddings", () => { // JavaScript documents should rank higher than non-JavaScript documents const topResult = results[0]; - expect(topResult.pageContent.toLowerCase()).toContain("javascript"); + expect(topResult.content.toLowerCase()).toContain("javascript"); // Results should have both vector and FTS ranking metadata const hybridResults = results.filter( - (r) => r.metadata.vec_rank !== undefined && r.metadata.fts_rank !== undefined, + (r) => r.vec_rank !== undefined && r.fts_rank !== undefined, ); // At least some results should be hybrid matches if (hybridResults.length > 0) { for (const result of hybridResults) { - expect(result.metadata.vec_rank).toBeGreaterThan(0); - expect(result.metadata.fts_rank).toBeGreaterThan(0); - expect(result.metadata.score).toBeGreaterThan(0); + expect(result.vec_rank).toBeGreaterThan(0); + expect(result.fts_rank).toBeGreaterThan(0); + expect(result.score).toBeGreaterThan(0); } } // All results should have valid scores for (const result of results) { - expect(result.metadata.score).toBeGreaterThan(0); - expect(typeof result.metadata.score).toBe("number"); + expect(result.score).toBeGreaterThan(0); + expect(typeof result.score).toBe("number"); // Results should have either vec_rank, fts_rank, or both - expect( - result.metadata.vec_rank !== undefined || - result.metadata.fts_rank !== undefined, - ).toBe(true); + expect(result.vec_rank !== undefined || result.fts_rank !== undefined).toBe(true); } }); @@ -353,22 +430,22 @@ describe("DocumentStore - With Embeddings", () => { // Should find programming documents const programmingResults = results.filter((r) => - r.pageContent.toLowerCase().includes("programming"), + r.content.toLowerCase().includes("programming"), ); expect(programmingResults.length).toBeGreaterThan(0); // At least some results should have vector ranks (semantic/embedding matching) // If no vector results, it might be because embeddings were disabled in this test run - const vectorResults = results.filter((r) => r.metadata.vec_rank !== undefined); - const ftsResults = results.filter((r) => r.metadata.fts_rank !== undefined); + const vectorResults = results.filter((r) => r.vec_rank !== undefined); + const ftsResults = results.filter((r) => r.fts_rank !== undefined); // Either we have vector results (hybrid search) or FTS results (fallback) expect(vectorResults.length > 0 || ftsResults.length > 0).toBe(true); // All results should have valid scores for (const result of results) { - expect(result.metadata.score).toBeGreaterThan(0); + expect(result.score).toBeGreaterThan(0); } }); }); @@ -386,30 +463,40 @@ describe("DocumentStore - With Embeddings", () => { } }); - it("should batch documents by character size limit", async () => { + it("should successfully embed and store large batches of documents", async () => { // Skip if embeddings are disabled // @ts-expect-error Accessing private property for testing if (!store.embeddings) { return; } - // Create 3 docs that fit 2 per batch by character size - const contentSize = 24000; // 24KB each - const docs: Document[] = Array.from({ length: 3 }, (_, i) => ({ - pageContent: "x".repeat(contentSize), - metadata: { - title: `Doc ${i + 1}`, - url: `https://example.com/doc${i + 1}`, - path: ["section"], - }, - })); + // Add multiple large documents to verify batching works correctly + const docCount = 5; + const contentSize = 15000; // 15KB each - ensures batching behavior + + for 
(let i = 0; i < docCount; i++) { + await store.addDocuments( + "batchtest", + "1.0.0", + 1, + createScrapeResult( + `Batch Doc ${i + 1}`, + `https://example.com/batch-doc${i + 1}`, + "x".repeat(contentSize), + ["section"], + ), + ); + } + + // Verify all documents were successfully embedded and stored + expect(await store.checkDocumentExists("batchtest", "1.0.0")).toBe(true); - await store.addDocuments("testlib", "1.0.0", docs); + // Verify embedDocuments was called (batching occurred) + expect(mockEmbedDocuments).toHaveBeenCalled(); - // Should create 2 batches - first with 2 docs, second with 1 doc - expect(mockEmbedDocuments).toHaveBeenCalledTimes(2); - expect(mockEmbedDocuments.mock.calls[0][0]).toHaveLength(2); - expect(mockEmbedDocuments.mock.calls[1][0]).toHaveLength(1); + // Verify all documents are searchable (embeddings were applied) + const searchResults = await store.findByContent("batchtest", "1.0.0", "Batch", 10); + expect(searchResults.length).toBe(docCount); }); it("should include proper document headers in embedding text", async () => { @@ -419,18 +506,16 @@ describe("DocumentStore - With Embeddings", () => { return; } - const docs: Document[] = [ - { - pageContent: "Test content", - metadata: { - title: "Test Title", - url: "https://example.com/test", - path: ["path", "to", "doc"], - }, - }, - ]; - - await store.addDocuments("testlib", "1.0.0", docs); + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult("Test Title", "https://example.com/test", "Test content", [ + "path", + "to", + "doc", + ]), + ); // Embedding text should include structured metadata expect(mockEmbedDocuments).toHaveBeenCalledTimes(1); @@ -445,18 +530,17 @@ describe("DocumentStore - With Embeddings", () => { describe("Status Tracking and Metadata", () => { it("should update version status correctly", async () => { - const docs: Document[] = [ - { - pageContent: "Status tracking test content", - metadata: { - title: "Status Test", - url: "https://example.com/status-test", - path: ["test"], - }, - }, - ]; - - await store.addDocuments("statuslib", "1.0.0", docs); + await store.addDocuments( + "statuslib", + "1.0.0", + 1, + createScrapeResult( + "Status Test", + "https://example.com/status-test", + "Status tracking test content", + ["test"], + ), + ); const versionId = await store.resolveVersionId("statuslib", "1.0.0"); await store.updateVersionStatus(versionId, VersionStatus.QUEUED); @@ -530,19 +614,18 @@ describe("DocumentStore - Without Embeddings (FTS-only)", () => { store = new DocumentStore(":memory:"); await store.initialize(); - const testDocuments: Document[] = [ - { - pageContent: "This is a test document about React hooks.", - metadata: { - url: "https://example.com/react-hooks", - title: "React Hooks Guide", - path: ["React", "Hooks"], - }, - }, - ]; - await expect( - store.addDocuments("react", "18.0.0", testDocuments), + store.addDocuments( + "react", + "18.0.0", + 1, + createScrapeResult( + "React Hooks Guide", + "https://example.com/react-hooks", + "This is a test document about React hooks.", + ["React", "Hooks"], + ), + ), ).resolves.not.toThrow(); const exists = await store.checkDocumentExists("react", "18.0.0"); @@ -555,43 +638,45 @@ describe("DocumentStore - Without Embeddings (FTS-only)", () => { store = new DocumentStore(":memory:"); await store.initialize(); - const testDocuments: Document[] = [ - { - pageContent: "React hooks are a powerful feature for state management.", - metadata: { - url: "https://example.com/react-hooks", - title: "React Hooks Guide", - 
path: ["React", "Hooks"], - }, - }, - { - pageContent: "TypeScript provides excellent type safety for JavaScript.", - metadata: { - url: "https://example.com/typescript-intro", - title: "TypeScript Introduction", - path: ["TypeScript", "Intro"], - }, - }, - ]; - - await store.addDocuments("testlib", "1.0.0", testDocuments); + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + "React Hooks Guide", + "https://example.com/react-hooks", + "React hooks are a powerful feature for state management.", + ["React", "Hooks"], + ), + ); + await store.addDocuments( + "testlib", + "1.0.0", + 1, + createScrapeResult( + "TypeScript Introduction", + "https://example.com/typescript-intro", + "TypeScript provides excellent type safety for JavaScript.", + ["TypeScript", "Intro"], + ), + ); }); it("should perform FTS-only search", async () => { const results = await store.findByContent("testlib", "1.0.0", "React hooks", 5); expect(results.length).toBeGreaterThan(0); - expect(results[0].pageContent).toContain("React hooks"); - expect(results[0].metadata).toHaveProperty("score"); - expect(results[0].metadata).toHaveProperty("fts_rank"); + expect(results[0].content).toContain("React hooks"); + expect(results[0]).toHaveProperty("score"); + expect(results[0]).toHaveProperty("fts_rank"); // Should NOT have vector rank since vectorization is disabled - expect(results[0].metadata.vec_rank).toBeUndefined(); + expect((results[0] as any).vec_rank).toBeUndefined(); }); it("should handle various search queries correctly", async () => { const jsResults = await store.findByContent("testlib", "1.0.0", "TypeScript", 5); expect(jsResults.length).toBeGreaterThan(0); - expect(jsResults[0].pageContent).toContain("TypeScript"); + expect(jsResults[0].content).toContain("TypeScript"); // Empty query should return empty results const emptyResults = await store.findByContent("testlib", "1.0.0", "", 5); @@ -657,64 +742,99 @@ describe("DocumentStore - Common Functionality", () => { describe("Version Isolation", () => { it("should search within specific versions only", async () => { - const docsV1: Document[] = [ - { - pageContent: "Old feature documentation", - metadata: { - title: "Old Feature", - url: "https://example.com/old", - path: ["features"], - }, - }, - ]; - - const docsV2: Document[] = [ - { - pageContent: "New feature documentation", - metadata: { - title: "New Feature", - url: "https://example.com/new", - path: ["features"], - }, - }, - ]; - - await store.addDocuments("featuretest", "1.0.0", docsV1); - await store.addDocuments("featuretest", "2.0.0", docsV2); + await store.addDocuments( + "featuretest", + "1.0.0", + 1, + createScrapeResult( + "Old Feature", + "https://example.com/old", + "Old feature documentation", + ["features"], + ), + ); + await store.addDocuments( + "featuretest", + "2.0.0", + 1, + createScrapeResult( + "New Feature", + "https://example.com/new", + "New feature documentation", + ["features"], + ), + ); const v1Results = await store.findByContent("featuretest", "1.0.0", "feature", 10); expect(v1Results.length).toBeGreaterThan(0); - expect(v1Results[0].metadata.title).toBe("Old Feature"); + expect(v1Results[0].title).toBe("Old Feature"); const v2Results = await store.findByContent("featuretest", "2.0.0", "feature", 10); expect(v2Results.length).toBeGreaterThan(0); - expect(v2Results[0].metadata.title).toBe("New Feature"); + expect(v2Results[0].title).toBe("New Feature"); }); }); describe("Document Management", () => { - it("should retrieve documents by ID", async () => { - 
const docs: Document[] = [ - { - pageContent: "Test document for ID retrieval", - metadata: { - title: "ID Test Doc", - url: "https://example.com/id-test", - path: ["test"], - }, - }, - ]; + it("should delete both documents and pages when removing all documents", async () => { + const library = "delete-test"; + const version = "1.0.0"; - await store.addDocuments("idtest", "1.0.0", docs); + // Add multiple pages with documents + await store.addDocuments( + library, + version, + 1, + createScrapeResult("Page 1", "https://example.com/page1", "Content for page 1", [ + "section1", + ]), + ); + await store.addDocuments( + library, + version, + 1, + createScrapeResult("Page 2", "https://example.com/page2", "Content for page 2", [ + "section2", + ]), + ); + + // Verify both pages and documents exist + const versionId = await store.resolveVersionId(library, version); + const pagesBefore = await store.getPagesByVersionId(versionId); + expect(pagesBefore.length).toBe(2); + expect(await store.checkDocumentExists(library, version)).toBe(true); + + // Delete all documents for this version + const deletedCount = await store.deletePages(library, version); + expect(deletedCount).toBe(2); // Should delete 2 documents + + // Verify both documents AND pages are gone + const pagesAfter = await store.getPagesByVersionId(versionId); + expect(pagesAfter.length).toBe(0); // Pages should be deleted too + expect(await store.checkDocumentExists(library, version)).toBe(false); + }); + + it("should retrieve documents by ID", async () => { + await store.addDocuments( + "idtest", + "1.0.0", + 1, + createScrapeResult( + "ID Test Doc", + "https://example.com/id-test", + "Test document for ID retrieval", + ["test"], + ), + ); const results = await store.findByContent("idtest", "1.0.0", "test document", 10); expect(results.length).toBeGreaterThan(0); const doc = results[0]; - expect(doc.metadata.id).toBeDefined(); + expect(doc.id).toBeDefined(); - const retrievedDoc = await store.getById(doc.metadata.id); + const retrievedDoc = await store.getById(doc.id); expect(retrievedDoc).not.toBeNull(); - expect(retrievedDoc?.metadata.title).toBe("ID Test Doc"); + expect(retrievedDoc?.title).toBe("ID Test Doc"); }); it("should handle URL pre-deletion correctly", async () => { @@ -745,39 +865,50 @@ describe("DocumentStore - Common Functionality", () => { return result.count; } - // Add initial documents - const initialDocs: Document[] = [ - { - pageContent: "Initial content chunk 1", - metadata: { url, title: "Initial Test Page", path: ["section1"] }, - }, - { - pageContent: "Initial content chunk 2", - metadata: { url, title: "Initial Test Page", path: ["section2"] }, - }, - ]; - - await store.addDocuments(library, version, initialDocs); + // Add initial page with 2 chunks + await store.addDocuments(library, version, 1, { + ...createScrapeResult("Initial Test Page", url, "Initial content chunk 1", [ + "section1", + ]), + chunks: [ + { + types: ["text"], + content: "Initial content chunk 1", + section: { level: 0, path: ["section1"] }, + }, + { + types: ["text"], + content: "Initial content chunk 2", + section: { level: 0, path: ["section2"] }, + }, + ], + }); expect(await countDocuments()).toBe(2); expect(await countDocuments(url)).toBe(2); - // Update with new documents (should trigger pre-deletion) - const updatedDocs: Document[] = [ - { - pageContent: "Updated content chunk 1", - metadata: { url, title: "Updated Test Page", path: ["updated-section1"] }, - }, - { - pageContent: "Updated content chunk 2", - metadata: { url, title: 
"Updated Test Page", path: ["updated-section2"] }, - }, - { - pageContent: "Updated content chunk 3", - metadata: { url, title: "Updated Test Page", path: ["updated-section3"] }, - }, - ]; - - await store.addDocuments(library, version, updatedDocs); + // Update with new page (should trigger pre-deletion) + await store.addDocuments(library, version, 1, { + ...createScrapeResult("Updated Test Page", url, "Updated content chunk 1", [ + "updated-section1", + ]), + chunks: [ + { + types: ["text"], + content: "Updated content chunk 1", + section: { level: 0, path: ["updated-section1"] }, + }, + { + types: ["text"], + content: "Updated content chunk 2", + section: { level: 0, path: ["updated-section2"] }, + }, + { + types: ["text"], + content: "Updated content chunk 3", + section: { level: 0, path: ["updated-section3"] }, + }, + ], + }); expect(await countDocuments()).toBe(3); expect(await countDocuments(url)).toBe(3); }); @@ -785,18 +916,17 @@ describe("DocumentStore - Common Functionality", () => { describe("Search Security", () => { beforeEach(async () => { - const docs: Document[] = [ - { - pageContent: "Programming computers is fun and educational for developers", - metadata: { - title: "Programming Guide", - url: "https://example.com/programming", - path: ["programming", "guide"], - }, - }, - ]; - - await store.addDocuments("security-test", "1.0.0", docs); + await store.addDocuments( + "security-test", + "1.0.0", + 1, + createScrapeResult( + "Programming Guide", + "https://example.com/programming", + "Programming computers is fun and educational for developers", + ["programming", "guide"], + ), + ); }); it("should safely handle malicious queries", async () => { @@ -831,4 +961,91 @@ describe("DocumentStore - Common Functionality", () => { } }); }); + + describe("Refresh Operations - getPagesByVersionId", () => { + beforeEach(async () => { + // Add pages with etags for building refresh queue + await store.addDocuments( + "refresh-queue-test", + "1.0.0", + 1, + createScrapeResult( + "Page 1", + "https://example.com/page1", + "Content 1", + ["section1"], + { etag: '"etag1"', lastModified: "2023-01-01T00:00:00Z" }, + ), + ); + await store.addDocuments( + "refresh-queue-test", + "1.0.0", + 1, + createScrapeResult( + "Page 2", + "https://example.com/page2", + "Content 2", + ["section2"], + { etag: '"etag2"', lastModified: "2023-01-02T00:00:00Z" }, + ), + ); + await store.addDocuments( + "refresh-queue-test", + "1.0.0", + 1, + createScrapeResult( + "Page 3 No ETag", + "https://example.com/page3", + "Content 3", + ["section3"], + { etag: null, lastModified: null }, + ), + ); + }); + + it("should retrieve all pages with metadata for refresh queue building", async () => { + const versionId = await store.resolveVersionId("refresh-queue-test", "1.0.0"); + const pages = await store.getPagesByVersionId(versionId); + + expect(pages.length).toBe(3); + + // Verify page1 metadata + const page1 = pages.find((p) => p.url === "https://example.com/page1"); + expect(page1).toBeDefined(); + expect(page1!.id).toBeDefined(); + expect(page1!.etag).toBe('"etag1"'); + expect(page1!.depth).toBe(1); + + // Verify page2 metadata + const page2 = pages.find((p) => p.url === "https://example.com/page2"); + expect(page2).toBeDefined(); + expect(page2!.etag).toBe('"etag2"'); + + // Verify page3 (no etag) + const page3 = pages.find((p) => p.url === "https://example.com/page3"); + expect(page3).toBeDefined(); + expect(page3!.etag).toBeNull(); + }); + + it("should return empty array for version with no pages", async () => { + 
const emptyVersionId = await store.resolveVersionId("empty-lib", "1.0.0"); + const pages = await store.getPagesByVersionId(emptyVersionId); + + expect(pages).toEqual([]); + }); + + it("should include all metadata fields needed for refresh", async () => { + const versionId = await store.resolveVersionId("refresh-queue-test", "1.0.0"); + const pages = await store.getPagesByVersionId(versionId); + + // All pages should have the necessary fields for refresh operations + for (const page of pages) { + expect(page.id).toBeDefined(); + expect(page.url).toBeDefined(); + expect(page.depth).toBeDefined(); + // etag can be null, but the field should exist + expect(page).toHaveProperty("etag"); + } + }); + }); }); diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index 8a4f9240..a6c0e022 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -1,10 +1,8 @@ -import type { Document } from "@langchain/core/documents"; import type { Embeddings } from "@langchain/core/embeddings"; import Database, { type Database as DatabaseType } from "better-sqlite3"; import semver from "semver"; import * as sqliteVec from "sqlite-vec"; -import type { ScraperOptions } from "../scraper/types"; -import type { DocumentMetadata } from "../types"; +import type { ScrapeResult, ScraperOptions } from "../scraper/types"; import { EMBEDDING_BATCH_CHARS, EMBEDDING_BATCH_SIZE, @@ -23,22 +21,23 @@ import { UnsupportedProviderError, } from "./embeddings/EmbeddingFactory"; import { ConnectionError, DimensionError, StoreError } from "./errors"; -import type { StoredScraperOptions } from "./types"; +import type { DbChunkMetadata, DbChunkRank, StoredScraperOptions } from "./types"; import { - type DbDocument, - type DbJoinedDocument, + type DbChunk, + type DbLibraryVersion, + type DbPage, + type DbPageChunk, type DbQueryResult, type DbVersion, type DbVersionWithLibrary, denormalizeVersionName, - mapDbDocumentToDocument, normalizeVersionName, VECTOR_DIMENSION, type VersionScraperOptions, type VersionStatus, } from "./types"; -interface RawSearchResult extends DbDocument { +interface RawSearchResult extends DbChunk { // Page fields joined from pages table url?: string; title?: string; @@ -75,11 +74,12 @@ export class DocumentStore { insertEmbedding: Database.Statement<[bigint, string]>; // New statement for pages table insertPage: Database.Statement< - [number, string, string, string | null, string | null, string | null] + [number, string, string, string | null, string | null, string | null, number | null] >; getPageId: Database.Statement<[number, string]>; deleteDocuments: Database.Statement<[string, string]>; - deleteDocumentsByUrl: Database.Statement<[string, string, string]>; + deleteDocumentsByPageId: Database.Statement<[number]>; + deletePage: Database.Statement<[number]>; deletePages: Database.Statement<[string, string]>; queryVersions: Database.Statement<[string]>; checkExists: Database.Statement<[string, string]>; @@ -114,6 +114,7 @@ export class DocumentStore { deleteLibraryById: Database.Statement<[number]>; countVersionsByLibraryId: Database.Statement<[number]>; getVersionId: Database.Statement<[string, string]>; + getPagesByVersionId: Database.Statement<[number]>; }; /** @@ -184,7 +185,7 @@ export class DocumentStore { private prepareStatements(): void { const statements = { getById: this.db.prepare<[bigint]>( - `SELECT d.*, p.url, p.title, p.content_type + `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, 
p.content_type FROM documents d JOIN pages p ON d.page_id = p.id WHERE d.id = ?`, @@ -197,9 +198,17 @@ export class DocumentStore { "UPDATE documents SET embedding = ? WHERE id = ?", ), insertPage: this.db.prepare< - [number, string, string, string | null, string | null, string | null] + [ + number, + string, + string, + string | null, + string | null, + string | null, + number | null, + ] >( - "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type) VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type", + "INSERT INTO pages (version_id, url, title, etag, last_modified, content_type, depth) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(version_id, url) DO UPDATE SET title = excluded.title, content_type = excluded.content_type, etag = excluded.etag, last_modified = excluded.last_modified, depth = excluded.depth", ), getPageId: this.db.prepare<[number, string]>( "SELECT id FROM pages WHERE version_id = ? AND url = ?", @@ -211,11 +220,11 @@ export class DocumentStore { "SELECT id FROM libraries WHERE name = ?", ), // New version-related statements - insertVersion: this.db.prepare<[number, string | null]>( + insertVersion: this.db.prepare<[number, string]>( "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING", ), - resolveVersionId: this.db.prepare<[number, string | null]>( - "SELECT id FROM versions WHERE library_id = ? AND name IS ?", + resolveVersionId: this.db.prepare<[number, string]>( + "SELECT id FROM versions WHERE library_id = ? AND name = ?", ), getVersionById: this.db.prepare<[number]>("SELECT * FROM versions WHERE id = ?"), queryVersionsByLibraryId: this.db.prepare<[number]>( @@ -230,15 +239,10 @@ export class DocumentStore { WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '') )`, ), - deleteDocumentsByUrl: this.db.prepare<[string, string, string]>( - `DELETE FROM documents - WHERE page_id IN ( - SELECT p.id FROM pages p - JOIN versions v ON p.version_id = v.id - JOIN libraries l ON v.library_id = l.id - WHERE p.url = ? AND l.name = ? 
AND COALESCE(v.name, '') = COALESCE(?, '') - )`, + deleteDocumentsByPageId: this.db.prepare<[number]>( + "DELETE FROM documents WHERE page_id = ?", ), + deletePage: this.db.prepare<[number]>("DELETE FROM pages WHERE id = ?"), deletePages: this.db.prepare<[string, string]>( `DELETE FROM pages WHERE version_id IN ( @@ -296,7 +300,7 @@ export class DocumentStore { getChildChunks: this.db.prepare< [string, string, string, number, string, bigint, number] >(` - SELECT d.*, p.url, p.title, p.content_type FROM documents d + SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -312,7 +316,7 @@ export class DocumentStore { getPrecedingSiblings: this.db.prepare< [string, string, string, bigint, string, number] >(` - SELECT d.*, p.url, p.title, p.content_type FROM documents d + SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -327,7 +331,7 @@ export class DocumentStore { getSubsequentSiblings: this.db.prepare< [string, string, string, bigint, string, number] >(` - SELECT d.*, p.url, p.title, p.content_type FROM documents d + SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -340,7 +344,7 @@ export class DocumentStore { LIMIT ? `), getParentChunk: this.db.prepare<[string, string, string, string, bigint]>(` - SELECT d.*, p.url, p.title, p.content_type FROM documents d + SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -383,6 +387,9 @@ export class DocumentStore { JOIN libraries l ON v.library_id = l.id WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`, ), + getPagesByVersionId: this.db.prepare<[number]>( + "SELECT * FROM pages WHERE version_id = ?", + ), }; this.statements = statements; } @@ -587,7 +594,7 @@ export class DocumentStore { this.statements.insertVersion.run(libraryId, normalizedVersion); const versionIdRow = this.statements.resolveVersionId.get( libraryId, - normalizedVersion === null ? 
"" : normalizedVersion, + normalizedVersion, ) as { id: number } | undefined; if (!versionIdRow || typeof versionIdRow.id !== "number") { throw new StoreError( @@ -672,8 +679,16 @@ export class DocumentStore { */ async storeScraperOptions(versionId: number, options: ScraperOptions): Promise { try { - // biome-ignore lint/correctness/noUnusedVariables: Extract source URL and exclude runtime-only fields using destructuring - const { url: source_url, library, version, signal, ...scraper_options } = options; + // Extract source URL and exclude runtime-only fields using destructuring + const { + url: source_url, + library: _library, + version: _version, + signal: _signal, + initialQueue: _initialQueue, + isRefresh: _isRefresh, + ...scraper_options + } = options; const optionsJson = JSON.stringify(scraper_options); this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId); @@ -765,21 +780,7 @@ export class DocumentStore { > > { try { - // Define the expected row structure from the GROUP BY query (including versions without documents) - interface LibraryVersionRow { - library: string; - version: string; - versionId: number; - status: VersionStatus; - progressPages: number; - progressMaxPages: number; - sourceUrl: string | null; - documentCount: number; - uniqueUrlCount: number; - indexedAt: string | null; // MIN() may return null - } - - const rows = this.statements.queryLibraryVersions.all() as LibraryVersionRow[]; + const rows = this.statements.queryLibraryVersions.all() as DbLibraryVersion[]; const libraryMap = new Map< string, Array<{ @@ -855,34 +856,22 @@ export class DocumentStore { async addDocuments( library: string, version: string, - documents: Document[], + depth: number, + result: ScrapeResult, ): Promise { try { - if (documents.length === 0) { + const { title, url, chunks } = result; + if (chunks.length === 0) { return; } - // Group documents by URL to create pages - const documentsByUrl = new Map(); - for (const doc of documents) { - const url = doc.metadata.url as string; - if (!url || typeof url !== "string" || !url.trim()) { - throw new StoreError("Document metadata must include a valid URL"); - } - - if (!documentsByUrl.has(url)) { - documentsByUrl.set(url, []); - } - documentsByUrl.get(url)?.push(doc); - } - // Generate embeddings in batch only if vector search is enabled let paddedEmbeddings: number[][] = []; if (this.isVectorSearchEnabled) { - const texts = documents.map((doc) => { - const header = `${doc.metadata.title}\n${doc.metadata.url}\n${(doc.metadata.path || []).join(" / ")}\n`; - return `${header}${doc.pageContent}`; + const texts = chunks.map((chunk) => { + const header = `${title}\n${url}\n${(chunk.section.path || []).join(" / ")}\n`; + return `${header}${chunk.content}`; }); // Batch embedding creation to avoid token limit errors @@ -940,106 +929,104 @@ export class DocumentStore { // Resolve library and version IDs (creates them if they don't exist) const versionId = await this.resolveVersionId(library, version); - // Delete existing documents for these URLs to prevent conflicts - for (const url of documentsByUrl.keys()) { - const deletedCount = await this.deleteDocumentsByUrl(library, version, url); - if (deletedCount > 0) { - logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`); + // Delete existing documents for this page to prevent conflicts + // First check if the page exists and get its ID + const existingPage = this.statements.getPageId.get(versionId, url) as + | { id: number } + | undefined; + + if 
(existingPage) { + const result = this.statements.deleteDocumentsByPageId.run(existingPage.id); + if (result.changes > 0) { + logger.debug(`Deleted ${result.changes} existing documents for URL: ${url}`); } } // Insert documents in a transaction - const transaction = this.db.transaction((docsByUrl: Map) => { - // First, create or update pages for each unique URL - const pageIds = new Map(); - - for (const [url, urlDocs] of docsByUrl) { - // Use the first document's metadata for page-level data - const firstDoc = urlDocs[0]; - const title = firstDoc.metadata.title || ""; - // Extract content type from metadata if available - const contentType = firstDoc.metadata.contentType || null; - - // Insert or update page record - this.statements.insertPage.run( - versionId, - url, - title, - null, // etag - will be populated during scraping - null, // last_modified - will be populated during scraping - contentType, - ); - - // Query for the page ID since we can't use RETURNING - const existingPage = this.statements.getPageId.get(versionId, url) as - | { id: number } - | undefined; - if (!existingPage) { - throw new StoreError(`Failed to get page ID for URL: ${url}`); - } - const pageId = existingPage.id; - pageIds.set(url, pageId); + const transaction = this.db.transaction(() => { + // Extract content type from metadata if available + const contentType = result.contentType || null; + + // Extract etag from document metadata if available + const etag = result.etag || null; + + // Extract lastModified from document metadata if available + const lastModified = result.lastModified || null; + + // Insert or update page record + this.statements.insertPage.run( + versionId, + url, + title || "", + etag, + lastModified, + contentType, + depth, + ); + + // Query for the page ID since we can't use RETURNING + const existingPage = this.statements.getPageId.get(versionId, url) as + | { id: number } + | undefined; + if (!existingPage) { + throw new StoreError(`Failed to get page ID for URL: ${url}`); } + const pageId = existingPage.id; // Then insert document chunks linked to their pages let docIndex = 0; - for (const [url, urlDocs] of docsByUrl) { - const pageId = pageIds.get(url); - if (!pageId) { - throw new StoreError(`Failed to get page ID for URL: ${url}`); - } + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + + // Insert document chunk + const result = this.statements.insertDocument.run( + pageId, + chunk.content, + JSON.stringify({ + types: chunk.types, + level: chunk.section.level, + path: chunk.section.path, + } satisfies DbChunkMetadata), + i, // sort_order within this page + ); + const rowId = result.lastInsertRowid; - for (let i = 0; i < urlDocs.length; i++) { - const doc = urlDocs[i]; - - // Create chunk-specific metadata (remove page-level fields) - const { - url: _, - title: __, - library: ___, - version: ____, - ...chunkMetadata - } = doc.metadata; - - // Insert document chunk - const result = this.statements.insertDocument.run( - pageId, - doc.pageContent, - JSON.stringify(chunkMetadata), - i, // sort_order within this page + // Insert into vector table only if vector search is enabled + if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) { + this.statements.insertEmbedding.run( + BigInt(rowId), + JSON.stringify(paddedEmbeddings[docIndex]), ); - const rowId = result.lastInsertRowid; - - // Insert into vector table only if vector search is enabled - if (this.isVectorSearchEnabled && paddedEmbeddings.length > 0) { - this.statements.insertEmbedding.run( - 
BigInt(rowId), - JSON.stringify(paddedEmbeddings[docIndex]), - ); - } - - docIndex++; } + + docIndex++; } }); - transaction(documentsByUrl); + transaction(); } catch (error) { throw new ConnectionError("Failed to add documents to store", error); } } /** - * Removes documents matching specified library and version + * Removes documents and pages matching specified library and version. + * This consolidated method deletes both documents and their associated pages. * @returns Number of documents deleted */ - async deleteDocuments(library: string, version: string): Promise { + async deletePages(library: string, version: string): Promise { try { const normalizedVersion = version.toLowerCase(); + + // First delete documents const result = this.statements.deleteDocuments.run( library.toLowerCase(), normalizedVersion, ); + + // Then delete the pages (after documents are gone, due to foreign key constraints) + this.statements.deletePages.run(library.toLowerCase(), normalizedVersion); + return result.changes; } catch (error) { throw new ConnectionError("Failed to delete documents", error); @@ -1047,24 +1034,40 @@ export class DocumentStore { } /** - * Removes documents for a specific URL within a library and version - * @returns Number of documents deleted + * Deletes a page and all its associated document chunks. + * Performs manual deletion in the correct order to satisfy foreign key constraints: + * 1. Delete document chunks (page_id references pages.id) + * 2. Delete page record + * + * This method is used during refresh operations when a page returns 404 Not Found. */ - async deleteDocumentsByUrl( - library: string, - version: string, - url: string, - ): Promise { + async deletePage(pageId: number): Promise { try { - const normalizedVersion = version.toLowerCase(); - const result = this.statements.deleteDocumentsByUrl.run( - url, - library.toLowerCase(), - normalizedVersion, - ); - return result.changes; + // Delete documents first (due to foreign key constraint) + const docResult = this.statements.deleteDocumentsByPageId.run(pageId); + logger.debug(`Deleted ${docResult.changes} document(s) for page ID ${pageId}`); + + // Then delete the page record + const pageResult = this.statements.deletePage.run(pageId); + if (pageResult.changes > 0) { + logger.debug(`Deleted page record for page ID ${pageId}`); + } } catch (error) { - throw new ConnectionError("Failed to delete documents by URL", error); + throw new ConnectionError(`Failed to delete page ${pageId}`, error); + } + } + + /** + * Retrieves all pages for a specific version ID with their metadata. + * Used for refresh operations to get existing pages with their ETags and depths. + * @returns Array of page records + */ + async getPagesByVersionId(versionId: number): Promise { + try { + const result = this.statements.getPagesByVersionId.all(versionId) as DbPage[]; + return result; + } catch (error) { + throw new ConnectionError("Failed to get pages by version ID", error); } } @@ -1109,7 +1112,7 @@ export class DocumentStore { // 4. libraries (if empty) // Delete all documents for this version - const documentsDeleted = await this.deleteDocuments(library, version); + const documentsDeleted = await this.deletePages(library, version); // Delete all pages for this version (must be done after documents, before version) this.statements.deletePages.run(normalizedLibrary, normalizedVersion); @@ -1141,21 +1144,42 @@ export class DocumentStore { } } + /** + * Parses the metadata field from a JSON string to an object. 
+ * This is necessary because better-sqlite3's json() function returns a string, not an object. + */ + private parseMetadata(row: T): T { + if (row.metadata && typeof row.metadata === "string") { + try { + row.metadata = JSON.parse(row.metadata); + } catch (error) { + logger.warn(`Failed to parse metadata JSON: ${error}`); + row.metadata = {} as M; + } + } + return row; + } + + /** + * Parses metadata for an array of rows. + */ + private parseMetadataArray(rows: T[]): T[] { + return rows.map((row) => this.parseMetadata(row)); + } + /** * Retrieves a document by its ID. * @param id The ID of the document. * @returns The document, or null if not found. */ - async getById(id: string): Promise { + async getById(id: string): Promise { try { - const row = this.statements.getById.get( - BigInt(id), - ) as DbQueryResult; + const row = this.statements.getById.get(BigInt(id)) as DbQueryResult; if (!row) { return null; } - return mapDbDocumentToDocument(row); + return this.parseMetadata(row); } catch (error) { throw new ConnectionError(`Failed to get document by ID ${id}`, error); } @@ -1171,7 +1195,7 @@ export class DocumentStore { version: string, query: string, limit: number, - ): Promise { + ): Promise<(DbPageChunk & DbChunkRank)[]> { try { // Return empty array for empty or whitespace-only queries if (!query || typeof query !== "string" || query.trim().length === 0) { @@ -1262,25 +1286,20 @@ export class DocumentStore { .sort((a, b) => b.rrf_score - a.rrf_score) .slice(0, limit); - return topResults.map((row) => ({ - ...mapDbDocumentToDocument({ + return topResults.map((row) => { + const result: DbPageChunk = { ...row, url: row.url || "", // Ensure url is never undefined - title: row.title, - content_type: row.content_type, - } as DbJoinedDocument), - metadata: { - ...JSON.parse(row.metadata), - id: row.id, + title: row.title || null, + content_type: row.content_type || null, + }; + // Add search scores as additional properties (not in metadata) + return Object.assign(result, { score: row.rrf_score, vec_rank: row.vec_rank, fts_rank: row.fts_rank, - // Explicitly add page fields if they exist - url: row.url || "", - title: row.title || "", - ...(row.content_type && { contentType: row.content_type }), - }, - })); + }); + }); } else { // Fallback: full-text search only const stmt = this.db.prepare(` @@ -1316,25 +1335,19 @@ export class DocumentStore { ) as (RawSearchResult & { fts_score: number })[]; // Assign FTS ranks based on order (best score = rank 1) - return rawResults.map((row, index) => ({ - ...mapDbDocumentToDocument({ + return rawResults.map((row, index) => { + const result: DbPageChunk = { ...row, url: row.url || "", // Ensure url is never undefined - title: row.title, - content_type: row.content_type, - } as DbJoinedDocument), - metadata: { - ...JSON.parse(row.metadata), - id: row.id, + title: row.title || null, + content_type: row.content_type || null, + }; + // Add search scores as additional properties (not in metadata) + return Object.assign(result, { score: -row.fts_score, // Convert BM25 score to positive value for consistency fts_rank: index + 1, // Assign rank based on order (1-based) - // Explicitly ensure vec_rank is not included in FTS-only mode - // Explicitly add page fields - url: row.url || "", - title: row.title || "", - ...(row.content_type && { contentType: row.content_type }), - }, - })); + }); + }); } } catch (error) { throw new ConnectionError( @@ -1352,28 +1365,27 @@ export class DocumentStore { version: string, id: string, limit: number, - ): Promise { + ): 
Promise { try { const parent = await this.getById(id); if (!parent) { return []; } - const parentPath = (parent.metadata as DocumentMetadata).path ?? []; - const parentUrl = (parent.metadata as DocumentMetadata).url; + const parentPath = parent.metadata.path ?? []; const normalizedVersion = version.toLowerCase(); const result = this.statements.getChildChunks.all( library.toLowerCase(), normalizedVersion, - parentUrl, + parent.url, parentPath.length + 1, JSON.stringify(parentPath), BigInt(id), limit, - ) as Array; + ) as Array; - return result.map((row) => mapDbDocumentToDocument(row)); + return this.parseMetadataArray(result); } catch (error) { throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error); } @@ -1387,26 +1399,25 @@ export class DocumentStore { version: string, id: string, limit: number, - ): Promise { + ): Promise { try { const reference = await this.getById(id); if (!reference) { return []; } - const refMetadata = reference.metadata as DocumentMetadata; const normalizedVersion = version.toLowerCase(); const result = this.statements.getPrecedingSiblings.all( library.toLowerCase(), normalizedVersion, - refMetadata.url, + reference.url, BigInt(id), - JSON.stringify(refMetadata.path), + JSON.stringify(reference.metadata.path), limit, - ) as Array; + ) as Array; - return result.reverse().map((row) => mapDbDocumentToDocument(row)); + return this.parseMetadataArray(result).reverse(); } catch (error) { throw new ConnectionError( `Failed to find preceding sibling chunks for ID ${id}`, @@ -1423,26 +1434,25 @@ export class DocumentStore { version: string, id: string, limit: number, - ): Promise { + ): Promise { try { const reference = await this.getById(id); if (!reference) { return []; } - const refMetadata = reference.metadata; const normalizedVersion = version.toLowerCase(); const result = this.statements.getSubsequentSiblings.all( library.toLowerCase(), normalizedVersion, - refMetadata.url, + reference.url, BigInt(id), - JSON.stringify(refMetadata.path), + JSON.stringify(reference.metadata.path), limit, - ) as Array; + ) as Array; - return result.map((row) => mapDbDocumentToDocument(row)); + return this.parseMetadataArray(result); } catch (error) { throw new ConnectionError( `Failed to find subsequent sibling chunks for ID ${id}`, @@ -1453,20 +1463,21 @@ export class DocumentStore { /** * Finds the parent chunk of a given document. + * Returns null if no parent is found or if there's a database error. + * Database errors are logged but not thrown to maintain consistent behavior. */ async findParentChunk( library: string, version: string, id: string, - ): Promise { + ): Promise { try { const child = await this.getById(id); if (!child) { return null; } - const childMetadata = child.metadata as DocumentMetadata; - const path = childMetadata.path ?? []; + const path = child.metadata.path ?? 
[]; const parentPath = path.slice(0, -1); if (parentPath.length === 0) { @@ -1477,37 +1488,38 @@ export class DocumentStore { const result = this.statements.getParentChunk.get( library.toLowerCase(), normalizedVersion, - childMetadata.url, + child.url, JSON.stringify(parentPath), BigInt(id), - ) as DbQueryResult; + ) as DbQueryResult; if (!result) { return null; } - return mapDbDocumentToDocument(result); + return this.parseMetadata(result); } catch (error) { - throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error); + logger.warn(`Failed to find parent chunk for ID ${id}: ${error}`); + return null; } } /** * Fetches multiple documents by their IDs in a single call. - * Returns an array of Document objects, sorted by their sort_order. + * Returns an array of DbPageChunk objects, sorted by their sort_order. */ async findChunksByIds( library: string, version: string, ids: string[], - ): Promise { + ): Promise { if (!ids.length) return []; try { const normalizedVersion = version.toLowerCase(); // Use parameterized query for variable number of IDs const placeholders = ids.map(() => "?").join(","); const stmt = this.db.prepare( - `SELECT d.*, p.url, p.title, p.content_type FROM documents d + `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -1520,8 +1532,8 @@ export class DocumentStore { library.toLowerCase(), normalizedVersion, ...ids, - ) as DbJoinedDocument[]; - return rows.map((row) => mapDbDocumentToDocument(row)); + ) as DbPageChunk[]; + return this.parseMetadataArray(rows); } catch (error) { throw new ConnectionError("Failed to fetch documents by IDs", error); } @@ -1529,17 +1541,17 @@ export class DocumentStore { /** * Fetches all document chunks for a specific URL within a library and version. - * Returns documents sorted by their sort_order for proper reassembly. + * Returns DbPageChunk objects sorted by their sort_order for proper reassembly. 
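Several of the rewritten queries select `json(d.metadata)` and then run the rows through `parseMetadata`, because SQLite's `json()` returns TEXT rather than an object. A small standalone sketch of that round trip, assuming the bundled SQLite has the JSON1 functions enabled (better-sqlite3 ships them by default); table and field names are illustrative:

```typescript
import Database from "better-sqlite3";

interface ChunkRow {
  id: number;
  metadata: string | { path?: string[]; level?: number };
}

const db = new Database(":memory:");
db.exec("CREATE TABLE documents (id INTEGER PRIMARY KEY, metadata TEXT)");
db.prepare("INSERT INTO documents (metadata) VALUES (?)").run(
  JSON.stringify({ path: ["Chapter 1", "Section 1.1"], level: 1 }),
);

const row = db
  .prepare("SELECT id, json(metadata) AS metadata FROM documents WHERE id = ?")
  .get(1) as ChunkRow;

// json() hands back a string; normalize it to an object and tolerate bad data.
if (typeof row.metadata === "string") {
  try {
    row.metadata = JSON.parse(row.metadata);
  } catch {
    row.metadata = {};
  }
}
```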
*/ async findChunksByUrl( library: string, version: string, url: string, - ): Promise { + ): Promise { try { const normalizedVersion = version.toLowerCase(); const stmt = this.db.prepare( - `SELECT d.*, p.url, p.title, p.content_type FROM documents d + `SELECT d.id, d.page_id, d.content, json(d.metadata) as metadata, d.sort_order, d.embedding, d.created_at, p.url, p.title, p.content_type FROM documents d JOIN pages p ON d.page_id = p.id JOIN versions v ON p.version_id = v.id JOIN libraries l ON v.library_id = l.id @@ -1552,8 +1564,8 @@ export class DocumentStore { library.toLowerCase(), normalizedVersion, url, - ) as DbJoinedDocument[]; - return rows.map((row) => mapDbDocumentToDocument(row)); + ) as DbPageChunk[]; + return this.parseMetadataArray(rows); } catch (error) { throw new ConnectionError(`Failed to fetch documents by URL ${url}`, error); } diff --git a/src/store/assembly/ContentAssemblyStrategyFactory.ts b/src/store/assembly/ContentAssemblyStrategyFactory.ts index 5235a1ee..c49f88c2 100644 --- a/src/store/assembly/ContentAssemblyStrategyFactory.ts +++ b/src/store/assembly/ContentAssemblyStrategyFactory.ts @@ -9,7 +9,7 @@ import type { ContentAssemblyStrategy } from "./types"; * @returns The appropriate strategy instance */ export function createContentAssemblyStrategy( - mimeType?: string, + mimeType?: string | null, ): ContentAssemblyStrategy { // Default to MarkdownAssemblyStrategy for unknown or missing MIME types if (!mimeType) { diff --git a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.test.ts b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.test.ts index fdbd957a..a76c3678 100644 --- a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.test.ts +++ b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.test.ts @@ -1,6 +1,6 @@ -import type { Document } from "@langchain/core/documents"; import { afterEach, beforeEach, describe, expect, it } from "vitest"; import { DocumentStore } from "../../DocumentStore"; +import type { DbChunkMetadata, DbPageChunk } from "../../types"; import { HierarchicalAssemblyStrategy } from "./HierarchicalAssemblyStrategy"; describe("HierarchicalAssemblyStrategy", () => { @@ -46,72 +46,56 @@ describe("HierarchicalAssemblyStrategy", () => { }); it("should reconstruct complete hierarchy for single match", async () => { - const versionId = await documentStore.resolveVersionId("test-hierarchy", "1.0"); - - expect(versionId).toBeGreaterThan(0); + // Use the public API to add documents + await documentStore.addDocuments("test-hierarchy", "1.0", 0, { + url: "Deep.ts", + title: "Deep TypeScript File", + contentType: "text/typescript", + textContent: "", + chunks: [ + { + content: "namespace UserManagement {", + section: { + path: ["UserManagement"], + level: 0, + }, + types: ["structural"], + }, + { + content: " export class UserService {", + section: { + path: ["UserManagement", "UserService"], + level: 1, + }, + types: ["structural"], + }, + { + content: " getUserById(id: string) { return db.find(id); }", + section: { + path: ["UserManagement", "UserService", "getUserById"], + level: 2, + }, + types: ["text"], + }, + ], + links: [], + errors: [], + }); - // Create a page first - const pageResult = (documentStore as any).statements.insertPage.run( - versionId, + // Query the database to get the actual document IDs + const allChunks = await documentStore.findChunksByUrl( + "test-hierarchy", + "1.0", "Deep.ts", - "Deep TypeScript File", - null, - null, - "text/typescript", ); - const pageId = pageResult.lastInsertRowid; 
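The rewritten tests describe each chunk by a hierarchical `section.path`, and the parent lookups address a chunk's parent by dropping the last path segment. A tiny sketch of that relationship, independent of the database:

```typescript
interface ChunkSection {
  path: string[];
  level: number;
}

// A chunk's parent lives at the same URL with the path minus its last segment;
// a single-segment path is already at the root and has no parent.
function parentPath(section: ChunkSection): string[] | null {
  return section.path.length > 1 ? section.path.slice(0, -1) : null;
}

parentPath({ path: ["UserManagement", "UserService", "getUserById"], level: 2 });
// -> ["UserManagement", "UserService"]
```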
+ expect(allChunks.length).toBe(3); - // Create a hierarchy: namespace > class > method - const namespaceResult = (documentStore as any).statements.insertDocument.run( - pageId, - "namespace UserManagement {", - JSON.stringify({ - url: "Deep.ts", - path: ["UserManagement"], - level: 0, - types: ["structural"], - }), - 0, - ); - const namespaceId = namespaceResult.lastInsertRowid; - - const classResult = (documentStore as any).statements.insertDocument.run( - pageId, - " export class UserService {", - JSON.stringify({ - url: "Deep.ts", - path: ["UserManagement", "UserService"], - level: 1, - types: ["structural"], - }), - 1, - ); - const classId = classResult.lastInsertRowid; - - const methodResult = (documentStore as any).statements.insertDocument.run( - pageId, - " getUserById(id: string) { return db.find(id); }", - JSON.stringify({ - url: "Deep.ts", - path: ["UserManagement", "UserService", "getUserById"], - level: 2, - types: ["content"], - }), - 2, - ); - const methodId = methodResult.lastInsertRowid; + const namespaceId = allChunks[0].id; + const classId = allChunks[1].id; + const methodId = allChunks[2].id; // Input: just the deeply nested method - const inputDoc: Document = { - id: methodId, - pageContent: " getUserById(id: string) { return db.find(id); }", - metadata: { - url: "Deep.ts", - path: ["UserManagement", "UserService", "getUserById"], - level: 2, - types: ["content"], - }, - }; + const inputDoc = allChunks[2]; const result = await strategy.selectChunks( "test-hierarchy", @@ -120,7 +104,7 @@ describe("HierarchicalAssemblyStrategy", () => { documentStore, ); - const resultContent = result.map((doc) => doc.pageContent); + const resultContent = result.map((doc) => doc.content); const resultIds = result.map((doc) => doc.id); // Should include the complete hierarchy: method + class + namespace @@ -138,62 +122,49 @@ describe("HierarchicalAssemblyStrategy", () => { }); it("should handle hierarchical gaps in parent chain", async () => { - const versionId = await documentStore.resolveVersionId("test-gaps", "1.0"); - - expect(versionId).toBeGreaterThan(0); + // Use the public API to add documents with a gap in the hierarchy + await documentStore.addDocuments("test-gaps", "1.0", 0, { + url: "GapTest.ts", + title: "Gap Test TypeScript File", + contentType: "text/typescript", + textContent: "", + chunks: [ + { + content: "namespace UserManagement {", + section: { + path: ["UserManagement"], + level: 0, + }, + types: ["structural"], + }, + // Intermediate class is missing (gap in hierarchy) + // No chunk with path: ["UserManagement", "UserService"] + { + content: " getUserById(id: string) { return db.find(id); }", + section: { + path: ["UserManagement", "UserService", "getUserById"], + level: 2, + }, + types: ["text"], + }, + ], + links: [], + errors: [], + }); - // Create a page first - const pageResult = (documentStore as any).statements.insertPage.run( - versionId, + // Query the database to get the actual document IDs + const allChunks = await documentStore.findChunksByUrl( + "test-gaps", + "1.0", "GapTest.ts", - "Gap Test TypeScript File", - null, - null, - "text/typescript", ); - const pageId = pageResult.lastInsertRowid; + expect(allChunks.length).toBe(2); - // Root namespace - exists - const namespaceResult = (documentStore as any).statements.insertDocument.run( - pageId, - "namespace UserManagement {", - JSON.stringify({ - url: "GapTest.ts", - path: ["UserManagement"], - level: 0, - types: ["structural"], - }), - 0, - ); - const namespaceId = namespaceResult.lastInsertRowid; + 
const namespaceId = allChunks[0].id; + const methodId = allChunks[1].id; - // Intermediate class - missing (gap in hierarchy) - // No chunk with path: ["UserManagement", "UserService"] - - // Deep method with missing intermediate parent - const methodResult = (documentStore as any).statements.insertDocument.run( - pageId, - " getUserById(id: string) { return db.find(id); }", - JSON.stringify({ - url: "GapTest.ts", - path: ["UserManagement", "UserService", "getUserById"], - level: 2, - types: ["content"], - }), - 1, - ); - const methodId = methodResult.lastInsertRowid; - - const inputDoc: Document = { - id: methodId, - pageContent: " getUserById(id: string) { return db.find(id); }", - metadata: { - url: "GapTest.ts", - path: ["UserManagement", "UserService", "getUserById"], - level: 2, - types: ["content"], - }, - }; + // Input: just the deeply nested method (with missing intermediate parent) + const inputDoc = allChunks[1]; const result = await strategy.selectChunks( "test-gaps", @@ -202,7 +173,7 @@ describe("HierarchicalAssemblyStrategy", () => { documentStore, ); - const resultContent = result.map((doc) => doc.pageContent); + const resultContent = result.map((doc) => doc.content); const resultIds = result.map((doc) => doc.id); // Should include the matched method and find the root namespace despite the gap @@ -216,61 +187,48 @@ describe("HierarchicalAssemblyStrategy", () => { }); it("should promote deeply nested anonymous functions to their top-level container", async () => { - const versionId = await documentStore.resolveVersionId("test-promotion", "1.0"); - - expect(versionId).toBeGreaterThan(0); + // Use the public API to add documents with nested anonymous function + await documentStore.addDocuments("test-promotion", "1.0", 0, { + url: "applyMigrations.ts", + title: "Apply Migrations TypeScript File", + contentType: "text/typescript", + textContent: "", + chunks: [ + { + content: + "export async function applyMigrations(db: Database): Promise {\n const overallTransaction = db.transaction(() => {\n console.log('migrating');\n });\n}", + section: { + path: ["applyMigrations"], + level: 1, + }, + types: ["code"], + }, + { + content: " console.log('migrating');", + section: { + path: ["applyMigrations", ""], + level: 2, + }, + types: ["code"], + }, + ], + links: [], + errors: [], + }); - // Create a page first - const pageResult = (documentStore as any).statements.insertPage.run( - versionId, + // Query the database to get the actual document IDs + const allChunks = await documentStore.findChunksByUrl( + "test-promotion", + "1.0", "applyMigrations.ts", - "Apply Migrations TypeScript File", - null, - null, - "text/typescript", ); - const pageId = pageResult.lastInsertRowid; + expect(allChunks.length).toBe(2); - // Create a simpler, more realistic scenario that matches how the splitter actually works - // Function containing nested arrow function - const topFunctionResult = (documentStore as any).statements.insertDocument.run( - pageId, - "export async function applyMigrations(db: Database): Promise {\n const overallTransaction = db.transaction(() => {\n console.log('migrating');\n });\n}", - JSON.stringify({ - url: "applyMigrations.ts", - path: ["applyMigrations"], - level: 1, - types: ["code", "content"], - }), - 0, - ); - const topFunctionId = topFunctionResult.lastInsertRowid; - - // Nested arrow function inside the main function - const nestedArrowResult = (documentStore as any).statements.insertDocument.run( - pageId, - " console.log('migrating');", - JSON.stringify({ - url: 
"applyMigrations.ts", - path: ["applyMigrations", ""], - level: 2, - types: ["code", "content"], - }), - 1, - ); - const nestedArrowId = nestedArrowResult.lastInsertRowid; + const topFunctionId = allChunks[0].id; + const nestedArrowId = allChunks[1].id; // Input: search hit on the nested anonymous arrow function - const inputDoc: Document = { - id: nestedArrowId, - pageContent: " console.log('migrating');", - metadata: { - url: "applyMigrations.ts", - path: ["applyMigrations", ""], - level: 2, - types: ["code", "content"], - }, - }; + const inputDoc = allChunks[1]; const result = await strategy.selectChunks( "test-promotion", @@ -279,7 +237,7 @@ describe("HierarchicalAssemblyStrategy", () => { documentStore, ); - const _resultContent = result.map((doc) => doc.pageContent); + const _resultContent = result.map((doc) => doc.content); const resultIds = result.map((doc) => doc.id); // Should promote to include the entire top-level function that contains the anonymous function @@ -297,95 +255,111 @@ describe("HierarchicalAssemblyStrategy", () => { expect(versionId).toBeGreaterThan(0); // Create a page first - const pageResult = (documentStore as any).statements.insertPage.run( + // @ts-expect-error Accessing private property for testing + const pageResult = documentStore.statements.insertPage.run( versionId, "UserService.ts", "User Service TypeScript File", null, null, "text/typescript", + 0, // depth ); - const pageId = pageResult.lastInsertRowid; + const pageId = pageResult.lastInsertRowid as number; // Class with multiple methods - only some will be matched - const _classOpenResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const _classOpenResult = documentStore.statements.insertDocument.run( pageId, "class UserService {", JSON.stringify({ - url: "UserService.ts", path: ["UserService", "opening"], level: 1, - }), + } satisfies DbChunkMetadata), 0, ); // Method 1: getUser (will be matched) - const getUserResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const getUserResult = documentStore.statements.insertDocument.run( pageId, " getUser(id) { return db.find(id); }", JSON.stringify({ - url: "UserService.ts", path: ["UserService", "opening", "getUser"], level: 2, - }), + } satisfies DbChunkMetadata), 1, ); const getUserId = getUserResult.lastInsertRowid.toString(); // Method 2: createUser (will NOT be matched) - (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + documentStore.statements.insertDocument.run( pageId, " createUser(data) { return db.create(data); }", JSON.stringify({ - url: "UserService.ts", path: ["UserService", "opening", "createUser"], level: 2, - }), + } satisfies DbChunkMetadata), 2, ); // Method 3: deleteUser (will be matched) - const deleteUserResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const deleteUserResult = documentStore.statements.insertDocument.run( pageId, " deleteUser(id) { return db.delete(id); }", JSON.stringify({ - url: "UserService.ts", path: ["UserService", "opening", "deleteUser"], level: 2, - }), + } satisfies DbChunkMetadata), 3, ); const deleteUserId = deleteUserResult.lastInsertRowid.toString(); - const inputDocs: Document[] = [ + const inputDocs: DbPageChunk[] = [ { id: getUserId, - pageContent: " getUser(id) { return db.find(id); }", + page_id: pageId, + url: 
"UserService.ts", + title: "User Service TypeScript File", + content_type: "text/typescript", + content: " getUser(id) { return db.find(id); }", metadata: { - url: "UserService.ts", path: ["UserService", "getUser"], level: 2, }, + sort_order: 1, + embedding: null, + created_at: new Date().toISOString(), + score: null, }, { id: deleteUserId, - pageContent: " deleteUser(id) { return db.delete(id); }", + page_id: pageId, + url: "UserService.ts", + title: "User Service TypeScript File", + content_type: "text/typescript", + content: " deleteUser(id) { return db.delete(id); }", metadata: { - url: "UserService.ts", path: ["UserService", "deleteUser"], level: 2, }, + sort_order: 3, + embedding: null, + created_at: new Date().toISOString(), + score: null, }, ]; const result = await strategy.selectChunks( "test-multi", "1.0", - inputDocs, + inputDocs as DbPageChunk[], documentStore, ); - const content = result.map((doc) => doc.pageContent); + const content = result.map((doc) => doc.content); // Should include both matched methods expect(content).toContain(" getUser(id) { return db.find(id); }"); @@ -401,70 +375,88 @@ describe("HierarchicalAssemblyStrategy", () => { expect(versionId).toBeGreaterThan(0); // Create pages first - const pageAResult = (documentStore as any).statements.insertPage.run( + // @ts-expect-error Accessing private property for testing + const pageAResult = documentStore.statements.insertPage.run( versionId, "FileA.ts", "File A TypeScript File", null, null, "text/typescript", + 0, // depth ); - const pageAId = pageAResult.lastInsertRowid; + const pageAId = pageAResult.lastInsertRowid as number; - const pageBResult = (documentStore as any).statements.insertPage.run( + // @ts-expect-error Accessing private property for testing + const pageBResult = documentStore.statements.insertPage.run( versionId, "FileB.ts", "File B TypeScript File", null, null, "text/typescript", + 0, // depth ); - const pageBId = pageBResult.lastInsertRowid; + const pageBId = pageBResult.lastInsertRowid as number; // File A - const methodAResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const methodAResult = documentStore.statements.insertDocument.run( pageAId, " methodAlpha() { return 'Alpha'; }", JSON.stringify({ - url: "FileA.ts", path: ["FileA", "methodAlpha"], level: 2, - }), + } satisfies DbChunkMetadata), 0, ); const methodAId = methodAResult.lastInsertRowid.toString(); // File B - const methodBResult = (documentStore as any).statements.insertDocument.run( + // @ts-expect-error Accessing private property for testing + const methodBResult = documentStore.statements.insertDocument.run( pageBId, " methodBeta() { return 'Beta'; }", JSON.stringify({ - url: "FileB.ts", path: ["FileB", "methodBeta"], level: 2, - }), + } satisfies DbChunkMetadata), 0, ); const methodBId = methodBResult.lastInsertRowid.toString(); - const inputDocs: Document[] = [ + const inputDocs: DbPageChunk[] = [ { id: methodAId, - pageContent: " methodAlpha() { return 'Alpha'; }", + page_id: pageAId, + url: "FileA.ts", + title: "File A TypeScript File", + content_type: "text/typescript", + content: " methodAlpha() { return 'Alpha'; }", metadata: { - url: "FileA.ts", path: ["FileA", "methodAlpha"], level: 2, }, + sort_order: 0, + embedding: null, + created_at: new Date().toISOString(), + score: null, }, { id: methodBId, - pageContent: " methodBeta() { return 'Beta'; }", + page_id: pageBId, + url: "FileB.ts", + title: "File B TypeScript File", + content_type: 
"text/typescript", + content: " methodBeta() { return 'Beta'; }", metadata: { - url: "FileB.ts", path: ["FileB", "methodBeta"], level: 2, }, + sort_order: 0, + embedding: null, + created_at: new Date().toISOString(), + score: null, }, ]; @@ -475,7 +467,7 @@ describe("HierarchicalAssemblyStrategy", () => { documentStore, ); - const content = result.map((d) => d.pageContent); + const content = result.map((d) => d.content); expect(content).toContain(" methodAlpha() { return 'Alpha'; }"); expect(content).toContain(" methodBeta() { return 'Beta'; }"); }); @@ -483,22 +475,22 @@ describe("HierarchicalAssemblyStrategy", () => { describe("assembleContent", () => { it("should concatenate chunks in document order", () => { - const chunks: Document[] = [ + const chunks: DbPageChunk[] = [ { id: "1", - pageContent: "class UserService {", + content: "class UserService {", metadata: {}, - }, + } as DbPageChunk, { id: "2", - pageContent: " getUser() { return 'user'; }", + content: " getUser() { return 'user'; }", metadata: {}, - }, + } as DbPageChunk, { id: "3", - pageContent: "}", + content: "}", metadata: {}, - }, + } as DbPageChunk, ]; const result = strategy.assembleContent(chunks); @@ -511,17 +503,17 @@ describe("HierarchicalAssemblyStrategy", () => { }); it("should provide debug output when requested", () => { - const chunks: Document[] = [ + const chunks: DbPageChunk[] = [ { id: "1", - pageContent: "function test() {", + content: "function test() {", metadata: { path: ["test"], level: 0 }, - }, + } as DbPageChunk, { id: "2", - pageContent: " return 42;", + content: " return 42;", metadata: { path: ["test", "return"], level: 1 }, - }, + } as DbPageChunk, ]; const result = strategy.assembleContent(chunks, true); diff --git a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts index a4d1f5de..a2ed5013 100644 --- a/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts +++ b/src/store/assembly/strategies/HierarchicalAssemblyStrategy.ts @@ -1,7 +1,7 @@ -import type { Document } from "@langchain/core/documents"; import { logger } from "../../../utils/logger"; import { MimeTypeUtils } from "../../../utils/mimeTypeUtils"; import type { DocumentStore } from "../../DocumentStore"; +import type { DbPageChunk } from "../../types"; import type { ContentAssemblyStrategy } from "../types"; /** @@ -44,18 +44,18 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { async selectChunks( library: string, version: string, - initialChunks: Document[], + initialChunks: DbPageChunk[], documentStore: DocumentStore, - ): Promise { + ): Promise { if (initialChunks.length === 0) { return []; } try { // Group chunks by document URL - const chunksByDocument = new Map(); + const chunksByDocument = new Map(); for (const chunk of initialChunks) { - const url = chunk.metadata.url as string; + const url = chunk.url; if (!chunksByDocument.has(url)) { chunksByDocument.set(url, []); } @@ -111,7 +111,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { } // IMPORTANT: Always include the original matched chunk first - allChunkIds.add(matched.id as string); + allChunkIds.add(matched.id); // Use promoted ancestor (may still be the original matched chunk if promotion not applicable) const ancestorParentChain = await this.walkToRoot( @@ -138,7 +138,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { // Multiple matches: use selective subtree reassembly // IMPORTANT: Always 
include all original matched chunks first for (const matched of documentChunks) { - allChunkIds.add(matched.id as string); + allChunkIds.add(matched.id); } const subtreeIds = await this.selectSubtreeChunks( @@ -171,18 +171,18 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { * Assembles chunks using simple concatenation. * Relies on splitter concatenation guarantees - chunks are designed to join seamlessly. */ - assembleContent(chunks: Document[], debug = false): string { + assembleContent(chunks: DbPageChunk[], debug = false): string { if (debug) { return chunks .map( (chunk) => `=== #${chunk.id} ${chunk.metadata.path?.join("/")} [${chunk.metadata.level}] ===\n` + - chunk.pageContent, + chunk.content, ) .join(""); } // Production/default: simple concatenation leveraging splitter guarantees. - return chunks.map((chunk) => chunk.pageContent).join(""); + return chunks.map((chunk) => chunk.content).join(""); } /** @@ -197,18 +197,18 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async walkToRoot( library: string, version: string, - chunk: Document, + chunk: DbPageChunk, documentStore: DocumentStore, ): Promise { const chainIds: string[] = []; const visited = new Set(); - let currentChunk: Document | null = chunk; + let currentChunk: DbPageChunk | null = chunk; const maxDepth = 50; // Safety limit to prevent runaway loops let depth = 0; // Walk up parent chain until we reach the root while (currentChunk && depth < maxDepth) { - const currentId = currentChunk.id as string; + const currentId = currentChunk.id; // Check for circular references if (visited.has(currentId)) { @@ -220,49 +220,21 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { chainIds.push(currentId); depth++; - try { - // Try normal parent lookup first - const parentChunk = await documentStore.findParentChunk( + // Try normal parent lookup first + let parentChunk = await documentStore.findParentChunk(library, version, currentId); + + // If no direct parent found, try gap-aware ancestor search + if (!parentChunk) { + parentChunk = await this.findAncestorWithGaps( library, version, - currentId, + currentChunk.url, + currentChunk.metadata.path ?? [], + documentStore, ); - - if (parentChunk) { - currentChunk = parentChunk; - } else { - // If normal parent lookup fails, try to find ancestors with gaps - currentChunk = await this.findAncestorWithGaps( - library, - version, - currentChunk.metadata as { url: string; path?: string[] }, - documentStore, - ); - } - } catch (error) { - // If standard lookup fails, try gap-aware ancestor search - try { - const currentMetadata = currentChunk?.metadata as { - url: string; - path?: string[]; - }; - if (currentMetadata) { - currentChunk = await this.findAncestorWithGaps( - library, - version, - currentMetadata, - documentStore, - ); - } else { - currentChunk = null; - } - } catch (gapError) { - logger.warn( - `Parent lookup failed for chunk ${currentId}: ${error}. 
Gap search also failed: ${gapError}`, - ); - break; - } } + + currentChunk = parentChunk; } if (depth >= maxDepth) { @@ -281,12 +253,10 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async findAncestorWithGaps( library: string, version: string, - metadata: { url: string; path?: string[] }, + url: string, + path: string[], documentStore: DocumentStore, - ): Promise { - const path = metadata.path || []; - const url = metadata.url; - + ): Promise { if (path.length <= 1) { return null; // Already at or near root } @@ -331,7 +301,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { url: string, targetPath: string[], documentStore: DocumentStore, - ): Promise { + ): Promise { try { // Get all chunks from the same document URL const allChunks = await documentStore.findChunksByUrl(library, version, url); @@ -342,7 +312,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { const matchingChunks = allChunks.filter((chunk) => { const chunkPath = (chunk.metadata.path as string[]) || []; - const chunkUrl = chunk.metadata.url as string; + const chunkUrl = chunk.url; // Must be in the same document if (chunkUrl !== url) return false; @@ -368,13 +338,13 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async findStructuralAncestor( library: string, version: string, - chunk: Document, + chunk: DbPageChunk, documentStore: DocumentStore, - ): Promise { - let current: Document | null = chunk; + ): Promise { + let current: DbPageChunk | null = chunk; // If current is structural already, return it - const isStructural = (c: Document | null) => + const isStructural = (c: DbPageChunk | null) => !!c && Array.isArray(c.metadata?.types) && c.metadata.types.includes("structural"); if (isStructural(current)) { @@ -383,11 +353,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { // Walk up until we find a structural ancestor while (true) { - const parent = await documentStore.findParentChunk( - library, - version, - current.id as string, - ); + const parent = await documentStore.findParentChunk(library, version, current.id); if (!parent) { return null; } @@ -405,7 +371,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async selectSubtreeChunks( library: string, version: string, - documentChunks: Document[], + documentChunks: DbPageChunk[], documentStore: DocumentStore, ): Promise { const chunkIds = new Set(); @@ -458,7 +424,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { /** * Finds the common ancestor path from a list of chunks by finding the longest common prefix. */ - private findCommonAncestorPath(chunks: Document[]): string[] { + private findCommonAncestorPath(chunks: DbPageChunk[]): string[] { if (chunks.length === 0) return []; if (chunks.length === 1) return (chunks[0].metadata.path as string[]) ?? 
[]; @@ -488,7 +454,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async findContainerChunks( library: string, version: string, - referenceChunk: Document, + referenceChunk: DbPageChunk, ancestorPath: string[], documentStore: DocumentStore, ): Promise { @@ -500,13 +466,13 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { const ancestorChunks = await this.findChunksByExactPath( library, version, - referenceChunk.metadata.url as string, + referenceChunk.url, ancestorPath, documentStore, ); for (const chunk of ancestorChunks) { - containerIds.push(chunk.id as string); + containerIds.push(chunk.id); } } catch (error) { logger.warn( @@ -527,7 +493,7 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { url: string, path: string[], documentStore: DocumentStore, - ): Promise { + ): Promise { try { // For root path, return empty - no specific chunks to find if (path.length === 0) { @@ -569,17 +535,17 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async findSubtreeChunks( library: string, version: string, - rootChunk: Document, + rootChunk: DbPageChunk, documentStore: DocumentStore, ): Promise { const subtreeIds: string[] = []; const visited = new Set(); - const queue: Document[] = [rootChunk]; + const queue: DbPageChunk[] = [rootChunk]; while (queue.length > 0) { // biome-ignore lint/style/noNonNullAssertion: this is safe due to the while condition const currentChunk = queue.shift()!; - const currentId = currentChunk.id as string; + const currentId = currentChunk.id; if (visited.has(currentId)) continue; visited.add(currentId); @@ -609,24 +575,20 @@ export class HierarchicalAssemblyStrategy implements ContentAssemblyStrategy { private async fallbackSelection( library: string, version: string, - initialChunks: Document[], + initialChunks: DbPageChunk[], documentStore: DocumentStore, - ): Promise { + ): Promise { const chunkIds = new Set(); // Just include the initial chunks and their immediate parents/children for (const chunk of initialChunks) { - const id = chunk.id as string; + const id = chunk.id; chunkIds.add(id); // Add parent for context - try { - const parent = await documentStore.findParentChunk(library, version, id); - if (parent) { - chunkIds.add(parent.id as string); - } - } catch (error) { - logger.warn(`Failed to find parent for chunk ${id}: ${error}`); + const parent = await documentStore.findParentChunk(library, version, id); + if (parent) { + chunkIds.add(parent.id); } // Add direct children (limited) diff --git a/src/store/assembly/strategies/MarkdownAssemblyStrategy.test.ts b/src/store/assembly/strategies/MarkdownAssemblyStrategy.test.ts index f22e1613..ef56e38b 100644 --- a/src/store/assembly/strategies/MarkdownAssemblyStrategy.test.ts +++ b/src/store/assembly/strategies/MarkdownAssemblyStrategy.test.ts @@ -1,6 +1,6 @@ -import { Document } from "@langchain/core/documents"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { DocumentStore } from "../../DocumentStore"; +import type { DbPageChunk } from "../../types"; import { MarkdownAssemblyStrategy } from "./MarkdownAssemblyStrategy"; // Mock DocumentStore with just the methods we need @@ -17,88 +17,96 @@ const createMockDocumentStore = () => const createDocumentUniverse = () => { return { // Target chunk (the one we're finding relations for) - target: new Document({ + target: { id: "target", - pageContent: "Target content", - metadata: { url: 
"https://example.com", path: ["Chapter 1", "Section 1.1"] }, - }), + content: "Target content", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.1"] }, + } as DbPageChunk, // Parent - parent: new Document({ + parent: { id: "parent", - pageContent: "Parent section content", - metadata: { url: "https://example.com", path: ["Chapter 1"] }, - }), + content: "Parent section content", + url: "https://example.com", + metadata: { path: ["Chapter 1"] }, + } as DbPageChunk, // Children (limit = 3, so child4 should be excluded) - child1: new Document({ + child1: { id: "child1", - pageContent: "First child content", + content: "First child content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1", "Subsection A"], }, - }), - child2: new Document({ + } as DbPageChunk, + child2: { id: "child2", - pageContent: "Second child content", + content: "Second child content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1", "Subsection B"], }, - }), - child3: new Document({ + } as DbPageChunk, + child3: { id: "child3", - pageContent: "Third child content", + content: "Third child content", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1", "Subsection C"], }, - }), - child4: new Document({ + } as DbPageChunk, + child4: { id: "child4", - pageContent: "Fourth child content (should be excluded)", + content: "Fourth child content (should be excluded)", + url: "https://example.com", metadata: { - url: "https://example.com", path: ["Chapter 1", "Section 1.1", "Subsection D"], }, - }), + } as DbPageChunk, // Preceding siblings (limit = 1, so only prev1 should be included) - prev1: new Document({ + prev1: { id: "prev1", - pageContent: "Previous sibling 1", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.0"] }, - }), - prev2: new Document({ + content: "Previous sibling 1", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.0"] }, + } as DbPageChunk, + prev2: { id: "prev2", - pageContent: "Previous sibling 2 (should be excluded)", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 0.9"] }, - }), + content: "Previous sibling 2 (should be excluded)", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 0.9"] }, + } as DbPageChunk, // Subsequent siblings (limit = 2) - next1: new Document({ + next1: { id: "next1", - pageContent: "Next sibling 1", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.2"] }, - }), - next2: new Document({ + content: "Next sibling 1", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.2"] }, + } as DbPageChunk, + next2: { id: "next2", - pageContent: "Next sibling 2", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.3"] }, - }), - next3: new Document({ + content: "Next sibling 2", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.3"] }, + } as DbPageChunk, + next3: { id: "next3", - pageContent: "Next sibling 3 (should be excluded)", - metadata: { url: "https://example.com", path: ["Chapter 1", "Section 1.4"] }, - }), + content: "Next sibling 3 (should be excluded)", + url: "https://example.com", + metadata: { path: ["Chapter 1", "Section 1.4"] }, + } as DbPageChunk, // Orphan chunk (no relations) - orphan: new Document({ + orphan: { id: "orphan", - pageContent: "Orphan content", - metadata: { url: "https://example.com/other", 
path: ["Standalone"] }, - }), + content: "Orphan content", + url: "https://example.com/other", + metadata: { path: ["Standalone"] }, + } as DbPageChunk, }; }; @@ -206,11 +214,11 @@ describe("MarkdownAssemblyStrategy", () => { }); it("handles chunks with existing newlines", () => { - const chunkWithNewlines = new Document({ + const chunkWithNewlines = { id: "newlines", - pageContent: "Line 1\nLine 2\n\nLine 4", + content: "Line 1\nLine 2\n\nLine 4", metadata: {}, - }); + } as DbPageChunk; const result = strategy.assembleContent([universe.target, chunkWithNewlines]); expect(result).toBe("Target content\n\nLine 1\nLine 2\n\nLine 4"); @@ -575,10 +583,10 @@ describe("MarkdownAssemblyStrategy", () => { }); it("handles chunks without IDs gracefully", async () => { - const invalidChunk = new Document({ - pageContent: "No ID chunk", + const invalidChunk = { + content: "No ID chunk", metadata: {}, - }); + } as DbPageChunk; // Mock all store methods to return empty arrays for undefined IDs vi.mocked(mockStore.findParentChunk).mockResolvedValue(null); diff --git a/src/store/assembly/strategies/MarkdownAssemblyStrategy.ts b/src/store/assembly/strategies/MarkdownAssemblyStrategy.ts index 9a4f61c3..8c391370 100644 --- a/src/store/assembly/strategies/MarkdownAssemblyStrategy.ts +++ b/src/store/assembly/strategies/MarkdownAssemblyStrategy.ts @@ -1,6 +1,6 @@ -import type { Document } from "@langchain/core/documents"; import { MimeTypeUtils } from "../../../utils/mimeTypeUtils"; import type { DocumentStore } from "../../DocumentStore"; +import type { DbPageChunk } from "../../types"; import type { ContentAssemblyStrategy } from "../types"; const CHILD_LIMIT = 3; @@ -53,9 +53,9 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { async selectChunks( library: string, version: string, - initialChunks: Document[], + initialChunks: DbPageChunk[], documentStore: DocumentStore, - ): Promise { + ): Promise { const allChunkIds = new Set(); // Process all initial chunks in parallel to gather related chunk IDs @@ -82,8 +82,8 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { /** * Assembles chunks using simple "\n\n" joining (current behavior). 
*/ - assembleContent(chunks: Document[]): string { - return chunks.map((chunk) => chunk.pageContent).join("\n\n"); + assembleContent(chunks: DbPageChunk[]): string { + return chunks.map((chunk) => chunk.content).join("\n\n"); } /** @@ -93,10 +93,10 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { private async getRelatedChunkIds( library: string, version: string, - doc: Document, + doc: DbPageChunk, documentStore: DocumentStore, ): Promise> { - const id = doc.id as string; + const id = doc.id; const relatedIds = new Set(); // Add the original chunk @@ -105,7 +105,7 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { // Parent const parent = await documentStore.findParentChunk(library, version, id); if (parent) { - relatedIds.add(parent.id as string); + relatedIds.add(parent.id); } // Preceding Siblings @@ -116,7 +116,7 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { PRECEDING_SIBLINGS_LIMIT, ); for (const sib of precedingSiblings) { - relatedIds.add(sib.id as string); + relatedIds.add(sib.id); } // Child Chunks @@ -127,7 +127,7 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { CHILD_LIMIT, ); for (const child of childChunks) { - relatedIds.add(child.id as string); + relatedIds.add(child.id); } // Subsequent Siblings @@ -138,7 +138,7 @@ export class MarkdownAssemblyStrategy implements ContentAssemblyStrategy { SUBSEQUENT_SIBLINGS_LIMIT, ); for (const sib of subsequentSiblings) { - relatedIds.add(sib.id as string); + relatedIds.add(sib.id); } return relatedIds; diff --git a/src/store/assembly/types.ts b/src/store/assembly/types.ts index bf926f17..a0c2a8ac 100644 --- a/src/store/assembly/types.ts +++ b/src/store/assembly/types.ts @@ -1,5 +1,5 @@ -import type { Document } from "@langchain/core/documents"; import type { DocumentStore } from "../DocumentStore"; +import type { DbPageChunk } from "../types"; /** * Strategy interface for content-type-aware search result assembly. @@ -28,9 +28,9 @@ export interface ContentAssemblyStrategy { selectChunks( library: string, version: string, - initialChunks: Document[], + initialChunks: DbPageChunk[], documentStore: DocumentStore, - ): Promise; + ): Promise; /** * Assembles the selected chunks into final content. 
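For orientation, the `ContentAssemblyStrategy` contract boils down to two phases: select related chunks, then join them into a string. A hedged sketch of a strategy against simplified local types (the real interface uses `DbPageChunk` and `DocumentStore`, and the strategy below is illustrative, not one shipped by the project):

```typescript
interface ChunkLike {
  id: string;
  url: string;
  content: string;
}

interface StoreLike {
  findChunksByUrl(library: string, version: string, url: string): Promise<ChunkLike[]>;
}

class WholePageStrategy {
  // Phase 1: pull every chunk from the pages the initial matches came from.
  async selectChunks(
    library: string,
    version: string,
    initialChunks: ChunkLike[],
    store: StoreLike,
  ): Promise<ChunkLike[]> {
    const urls = [...new Set(initialChunks.map((chunk) => chunk.url))];
    const pages = await Promise.all(
      urls.map((url) => store.findChunksByUrl(library, version, url)),
    );
    return pages.flat();
  }

  // Phase 2: join the selected chunks back together in stored order.
  assembleContent(chunks: ChunkLike[]): string {
    return chunks.map((chunk) => chunk.content).join("\n\n");
  }
}
```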
@@ -38,7 +38,7 @@ export interface ContentAssemblyStrategy { * @param chunks The chunks to assemble (already in proper order) * @returns The assembled content string */ - assembleContent(chunks: Document[]): string; + assembleContent(chunks: DbPageChunk[]): string; } /** @@ -46,7 +46,7 @@ export interface ContentAssemblyStrategy { */ export interface ContentAssemblyContext { /** The chunks that matched the search query */ - initialChunks: Document[]; + initialChunks: DbPageChunk[]; /** MIME type of the content (from first chunk metadata) */ mimeType?: string; /** Document URL for grouping */ @@ -60,7 +60,7 @@ export interface ContentAssemblyContext { */ export interface ChunkSelectionResult { /** Selected chunks in proper order for assembly */ - chunks: Document[]; + chunks: DbPageChunk[]; /** Strategy that was used for selection */ strategy: string; /** Any warnings or notes about the selection process */ diff --git a/src/store/types.ts b/src/store/types.ts index 358d1622..5a734eb3 100644 --- a/src/store/types.ts +++ b/src/store/types.ts @@ -1,5 +1,4 @@ import type { ScrapeMode } from "../scraper/types"; -import type { DocumentMetadata } from "../types"; /** Default vector dimension used across the application */ export const VECTOR_DIMENSION = 1536; @@ -15,18 +14,33 @@ export interface DbPage { etag: string | null; last_modified: string | null; content_type: string | null; + depth: number | null; created_at: string; updated_at: string; } +/** + * Chunk-level metadata stored with each document chunk. + * Contains hierarchical information about the chunk's position within the page. + */ +export interface DbChunkMetadata { + level?: number; // Hierarchical level in document + path?: string[]; // Hierarchical path in document + // TODO: Check if `types` is properly used + types?: string[]; // Types of content in this chunk (e.g., "text", "code", "table") + // TODO: Enable additional metadata fields again once we have a clear schema for what metadata we want to store with each chunk. + // Allow for additional chunk-specific metadata + // [key: string]: unknown; +} + /** * Database document record type matching the documents table schema */ -export interface DbDocument { +export interface DbChunk { id: string; page_id: number; // Foreign key to pages table content: string; - metadata: string; // JSON string of chunk-specific metadata (level, path, etc.) + metadata: DbChunkMetadata; // Chunk-specific metadata (level, path, etc.) sort_order: number; embedding: Buffer | null; // Binary blob for embeddings created_at: string; @@ -37,36 +51,26 @@ export interface DbDocument { * Represents the result of a JOIN between the documents and pages tables. * It includes all fields from a document chunk plus the relevant page-level metadata. */ -export interface DbJoinedDocument extends DbDocument { +export interface DbPageChunk extends DbChunk { url: string; - title: string | null; - content_type: string | null; + title?: string | null; + content_type?: string | null; } /** - * Utility type for handling SQLite query results that may be undefined + * Represents the ranking information for a search result, including both + * vector and full-text search ranks. */ -export type DbQueryResult = T | undefined; +export interface DbChunkRank { + score: number; + vec_rank?: number; + fts_rank?: number; +} /** - * Maps raw database document with joined page data to the Document type used by the application. - * Now uses the explicit DbJoinedDocument type for improved type safety. 
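The new row types separate what is stored (a page-joined chunk) from how a search ranked it, which is why hybrid search now returns `(DbPageChunk & DbChunkRank)[]`. A small sketch of that composition using local stand-ins for the two shapes:

```typescript
interface PageChunk {
  id: string;
  page_id: number;
  url: string;
  content: string;
  metadata: { path?: string[]; level?: number };
  sort_order: number;
}

interface ChunkRank {
  score: number;
  vec_rank?: number;
  fts_rank?: number;
}

function withRank(chunk: PageChunk, score: number, ftsRank?: number): PageChunk & ChunkRank {
  // Keep the scores as top-level fields instead of stuffing them into
  // metadata, matching the approach taken in the search result mapping above.
  return Object.assign({ ...chunk }, { score, fts_rank: ftsRank });
}
```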
+ * Utility type for handling SQLite query results that may be undefined */ -export function mapDbDocumentToDocument(doc: DbJoinedDocument) { - const chunkMetadata = JSON.parse(doc.metadata) as DocumentMetadata; - - return { - id: doc.id, - pageContent: doc.content, - metadata: { - ...chunkMetadata, - // Page-level fields are always available from joined queries - url: doc.url, - title: doc.title || "", // Convert null to empty string for consistency - ...(doc.content_type && { contentType: doc.content_type }), - } as DocumentMetadata, - }; -} +export type DbQueryResult = T | undefined; /** * Search result type returned by the DocumentRetrieverService @@ -75,7 +79,7 @@ export interface StoreSearchResult { url: string; content: string; score: number | null; - mimeType?: string; + mimeType?: string | null; } /** @@ -299,3 +303,20 @@ export function isActiveStatus(status: VersionStatus): boolean { status, ); } + +/** + * Library version row returned by queryLibraryVersions. + * Aggregates version metadata with document counts and indexing status. + */ +export interface DbLibraryVersion { + library: string; + version: string; + versionId: number; + status: VersionStatus; + progressPages: number; + progressMaxPages: number; + sourceUrl: string | null; + documentCount: number; + uniqueUrlCount: number; + indexedAt: string | null; +} diff --git a/src/tools/FetchUrlTool.ts b/src/tools/FetchUrlTool.ts index 5b0e3fdf..669a31b6 100644 --- a/src/tools/FetchUrlTool.ts +++ b/src/tools/FetchUrlTool.ts @@ -2,7 +2,7 @@ import type { AutoDetectFetcher, RawContent } from "../scraper/fetcher"; import { HtmlPipeline } from "../scraper/pipelines/HtmlPipeline"; import { MarkdownPipeline } from "../scraper/pipelines/MarkdownPipeline"; import { TextPipeline } from "../scraper/pipelines/TextPipeline"; -import type { ContentPipeline, ProcessedContent } from "../scraper/pipelines/types"; +import type { ContentPipeline, PipelineResult } from "../scraper/pipelines/types"; import { ScrapeMode } from "../scraper/types"; import { convertToString } from "../scraper/utils/buffer"; import { resolveCharset } from "../scraper/utils/charset"; @@ -96,9 +96,9 @@ export class FetchUrlTool { logger.info("🔄 Processing content..."); - let processed: Awaited | undefined; + let processed: Awaited | undefined; for (const pipeline of this.pipelines) { - if (pipeline.canProcess(rawContent)) { + if (pipeline.canProcess(rawContent.mimeType, rawContent.content)) { processed = await pipeline.process( rawContent, { @@ -135,7 +135,7 @@ export class FetchUrlTool { return contentString; } - for (const err of processed.errors) { + for (const err of processed.errors ?? 
[]) { logger.warn(`⚠️ Processing error for ${url}: ${err.message}`); } diff --git a/src/tools/ListJobsTool.test.ts b/src/tools/ListJobsTool.test.ts index b6b8cdc0..c3ed6f45 100644 --- a/src/tools/ListJobsTool.test.ts +++ b/src/tools/ListJobsTool.test.ts @@ -1,7 +1,7 @@ import { beforeEach, describe, expect, it, type Mock, vi } from "vitest"; import type { PipelineManager } from "../pipeline/PipelineManager"; import { type PipelineJob, PipelineJobStatus } from "../pipeline/types"; -import type { ScraperProgress } from "../scraper/types"; +import type { ScraperProgressEvent } from "../scraper/types"; import { VersionStatus } from "../store/types"; import { ListJobsTool } from "./ListJobsTool"; @@ -49,7 +49,7 @@ describe("ListJobsTool", () => { currentUrl: "url2/page5", depth: 1, maxDepth: 3, - } as ScraperProgress, + } as ScraperProgressEvent, error: null, finishedAt: null, // Database fields @@ -60,7 +60,12 @@ describe("ListJobsTool", () => { errorMessage: null, updatedAt: new Date("2023-01-01T11:05:00Z"), sourceUrl: "url2", - scraperOptions: { maxDepth: 3 }, + scraperOptions: { + url: "url2", + library: "lib-b", + version: "2.0.0", + maxDepth: 3, + }, }, { id: "job-3", @@ -76,7 +81,7 @@ describe("ListJobsTool", () => { currentUrl: "url3/page10", depth: 2, maxDepth: 2, - } as ScraperProgress, + } as ScraperProgressEvent, error: null, // Database fields versionId: 3, @@ -86,7 +91,12 @@ describe("ListJobsTool", () => { errorMessage: null, updatedAt: new Date("2023-01-01T12:15:00Z"), sourceUrl: "url3", - scraperOptions: { maxDepth: 2 }, + scraperOptions: { + url: "url3", + library: "lib-a", + version: "1.1.0", + maxDepth: 2, + }, }, ]; diff --git a/src/tools/RefreshVersionTool.ts b/src/tools/RefreshVersionTool.ts new file mode 100644 index 00000000..97140529 --- /dev/null +++ b/src/tools/RefreshVersionTool.ts @@ -0,0 +1,95 @@ +import * as semver from "semver"; +import type { IPipeline } from "../pipeline/trpc/interfaces"; +import { logger } from "../utils/logger"; +import { ValidationError } from "./errors"; + +export interface RefreshVersionToolOptions { + library: string; + version?: string | null; // Make version optional + /** If false, returns jobId immediately without waiting. Defaults to true. */ + waitForCompletion?: boolean; +} + +export interface RefreshResult { + /** Indicates the number of pages refreshed if waitForCompletion was true and the job succeeded. May be 0 or inaccurate if job failed or waitForCompletion was false. */ + pagesRefreshed: number; +} + +/** Return type for RefreshVersionTool.execute */ +export type RefreshExecuteResult = RefreshResult | { jobId: string }; + +/** + * Tool for refreshing an existing library version by re-scraping all pages + * and using ETag comparison to skip unchanged content. 
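The refresh tool's ETag-based skip logic reduces to one conditional GET per stored page. A sketch using the built-in `fetch` (Node 18+); the 304/404/200 handling mirrors the behavior the diff describes, not the project's actual fetcher:

```typescript
interface RefreshCheck {
  changed: boolean;
  deleted?: boolean;
  etag: string | null;
  body?: string;
}

async function checkPage(url: string, storedEtag: string | null): Promise<RefreshCheck> {
  const response = await fetch(url, {
    headers: storedEtag ? { "If-None-Match": storedEtag } : {},
  });

  if (response.status === 304) {
    // Unchanged: keep the existing chunks and ETag.
    return { changed: false, etag: storedEtag };
  }
  if (response.status === 404) {
    // Gone: the caller should delete the page and its chunks.
    return { changed: true, deleted: true, etag: null };
  }
  // Updated: re-process the body and store the new ETag.
  return {
    changed: true,
    etag: response.headers.get("etag"),
    body: await response.text(),
  };
}
```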
+ */ +export class RefreshVersionTool { + private pipeline: IPipeline; + + constructor(pipeline: IPipeline) { + this.pipeline = pipeline; + } + + async execute(options: RefreshVersionToolOptions): Promise { + const { library, version, waitForCompletion = true } = options; + + let internalVersion: string; + const partialVersionRegex = /^\d+(\.\d+)?$/; // Matches '1' or '1.2' + + if (version === null || version === undefined) { + internalVersion = ""; + } else { + const validFullVersion = semver.valid(version); + if (validFullVersion) { + internalVersion = validFullVersion; + } else if (partialVersionRegex.test(version)) { + const coercedVersion = semver.coerce(version); + if (coercedVersion) { + internalVersion = coercedVersion.version; + } else { + throw new ValidationError( + `Invalid version format for refreshing: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`, + "RefreshVersionTool", + ); + } + } else { + throw new ValidationError( + `Invalid version format for refreshing: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`, + "RefreshVersionTool", + ); + } + } + + internalVersion = internalVersion.toLowerCase(); + + // Use the injected pipeline instance + const pipeline = this.pipeline; + + // Normalize pipeline version argument: use null for unversioned to be explicit cross-platform + const refreshVersion: string | null = internalVersion === "" ? null : internalVersion; + + // Enqueue the refresh job using the injected pipeline + const jobId = await pipeline.enqueueRefreshJob(library, refreshVersion); + + // Conditionally wait for completion + if (waitForCompletion) { + try { + await pipeline.waitForJobCompletion(jobId); + // Fetch final job state to get status and potentially final page count + const finalJob = await pipeline.getJob(jobId); + const finalPagesRefreshed = finalJob?.progress?.pagesScraped ?? 0; // Get count from final job state + logger.debug( + `Refresh job ${jobId} finished with status ${finalJob?.status}. Pages refreshed: ${finalPagesRefreshed}`, + ); + return { + pagesRefreshed: finalPagesRefreshed, + }; + } catch (error) { + logger.error(`❌ Refresh job ${jobId} failed or was cancelled: ${error}`); + throw error; // Re-throw so the caller knows it failed + } + } + + // If not waiting, return the job ID immediately + return { jobId }; + } +} diff --git a/src/tools/ScrapeTool.test.ts b/src/tools/ScrapeTool.test.ts index 567b3376..c04d88f9 100644 --- a/src/tools/ScrapeTool.test.ts +++ b/src/tools/ScrapeTool.test.ts @@ -21,7 +21,7 @@ describe("ScrapeTool", () => { // Mock the manager instance methods mockManagerInstance = { start: vi.fn().mockResolvedValue(undefined), - enqueueJob: vi.fn().mockResolvedValue(MOCK_JOB_ID), // Return a mock job ID + enqueueScrapeJob: vi.fn().mockResolvedValue(MOCK_JOB_ID), // Return a mock job ID waitForJobCompletion: vi.fn().mockResolvedValue(undefined), // Default success getJob: vi.fn().mockResolvedValue({ // Mock getJob for final status check @@ -64,13 +64,13 @@ describe("ScrapeTool", () => { const options = getBaseOptions(input); await scrapeTool.execute(options); - // Check enqueueJob call (implies constructor was called) + // Check enqueueScrapeJob call (implies constructor was called) const expectedVersionArg = typeof expectedInternal === "string" ? 
expectedInternal.toLowerCase() : expectedInternal; // null stays null - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledWith( + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledWith( "test-lib", expectedVersionArg, expect.objectContaining({ url: options.url }), // Check basic options passed @@ -87,7 +87,7 @@ describe("ScrapeTool", () => { await expect(scrapeTool.execute(options)).rejects.toThrow( /Invalid version format for scraping/, ); - expect(mockManagerInstance.enqueueJob).not.toHaveBeenCalled(); + expect(mockManagerInstance.enqueueScrapeJob).not.toHaveBeenCalled(); }, ); @@ -105,8 +105,8 @@ describe("ScrapeTool", () => { }; await scrapeTool.execute(options); - // Check enqueueJob options - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledWith( + // Check enqueueScrapeJob options + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledWith( "test-lib", "1.0.0", // Normalized and lowercased { @@ -148,7 +148,7 @@ describe("ScrapeTool", () => { const result = await scrapeTool.execute(options); expect(result).toEqual({ jobId: MOCK_JOB_ID }); - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledOnce(); + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledOnce(); expect(mockManagerInstance.waitForJobCompletion).not.toHaveBeenCalled(); // Should not wait }); @@ -156,7 +156,7 @@ describe("ScrapeTool", () => { const options = getBaseOptions("1.0.0"); // waitForCompletion is omitted (defaults to true) await scrapeTool.execute(options); - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledOnce(); + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledOnce(); expect(mockManagerInstance.waitForJobCompletion).toHaveBeenCalledWith(MOCK_JOB_ID); // Should wait }); @@ -166,7 +166,7 @@ describe("ScrapeTool", () => { (mockManagerInstance.waitForJobCompletion as Mock).mockRejectedValue(jobError); await expect(scrapeTool.execute(options)).rejects.toThrow("Job failed"); - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledOnce(); // Job was still enqueued + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledOnce(); // Job was still enqueued }); it("should pass custom headers to the pipeline manager", async () => { @@ -180,7 +180,7 @@ describe("ScrapeTool", () => { }, }; await scrapeTool.execute(options); - expect(mockManagerInstance.enqueueJob).toHaveBeenCalledWith( + expect(mockManagerInstance.enqueueScrapeJob).toHaveBeenCalledWith( "test-lib", "2.0.0", expect.objectContaining({ diff --git a/src/tools/ScrapeTool.ts b/src/tools/ScrapeTool.ts index 4208cf2f..c98ddbf6 100644 --- a/src/tools/ScrapeTool.ts +++ b/src/tools/ScrapeTool.ts @@ -132,7 +132,7 @@ export class ScrapeTool { const enqueueVersion: string | null = internalVersion === "" ? 
null : internalVersion; // Enqueue the job using the injected pipeline - const jobId = await pipeline.enqueueJob(library, enqueueVersion, { + const jobId = await pipeline.enqueueScrapeJob(library, enqueueVersion, { url: url, library: library, version: internalVersion, diff --git a/src/tools/index.ts b/src/tools/index.ts index aa659e6e..8b8b8fbe 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -6,6 +6,7 @@ export * from "./FindVersionTool"; export * from "./GetJobInfoTool"; export * from "./ListJobsTool"; export * from "./ListLibrariesTool"; +export * from "./RefreshVersionTool"; export * from "./RemoveTool"; export * from "./ScrapeTool"; export * from "./SearchTool"; diff --git a/src/types/index.ts b/src/types/index.ts index 0a34eac8..8acee190 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -1,46 +1,3 @@ -/** - * Common document content type shared across modules - */ -export interface Document { - content: string; - metadata: DocumentMetadata; - contentType?: string; // MIME type of the original content -} - -/** - * Page-level metadata stored in the pages table - */ -export interface PageMetadata { - url: string; - title: string; - etag?: string; - lastModified?: string; - contentType?: string; -} - -/** - * Chunk-level metadata stored with each document chunk - */ -export interface ChunkMetadata { - level?: number; // Hierarchical level in document - path?: string[]; // Hierarchical path in document - // Allow for additional chunk-specific metadata - [key: string]: unknown; -} - -/** - * Common metadata fields shared across document chunks - * This combines page-level and chunk-level metadata for backward compatibility - */ -export interface DocumentMetadata extends ChunkMetadata { - url: string; - title: string; - library: string; - version: string; - level?: number; // Optional during scraping - path?: string[]; // Optional during scraping -} - /** * Generic progress callback type */ diff --git a/test/refresh-pipeline-e2e.test.ts b/test/refresh-pipeline-e2e.test.ts new file mode 100644 index 00000000..e0af9dca --- /dev/null +++ b/test/refresh-pipeline-e2e.test.ts @@ -0,0 +1,765 @@ +/** + * End-to-end tests for the refresh pipeline functionality. + * + * These tests validate that the refresh feature correctly handles: + * - Page deletions (404 responses) + * - Page updates (200 responses with new content) + * - Unchanged pages (304 responses) + * - Graceful error handling for broken links during normal scraping + * + * Uses nock to mock HTTP responses and an in-memory database for testing. 
+ */ + +import nock from "nock"; +import { vol } from "memfs"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { PipelineManager } from "../src/pipeline/PipelineManager"; +import { ScraperService } from "../src/scraper/ScraperService"; +import type { ScraperOptions } from "../src/scraper/types"; +import { DocumentManagementService } from "../src/store/DocumentManagementService"; +import type { StoreSearchResult } from "../src/store/types"; +import { ScraperRegistry } from "../src/scraper"; + +// Mock file system for file-based tests +vi.mock("node:fs/promises", () => ({ default: vol.promises })); + +describe("Refresh Pipeline E2E Tests", () => { + let docService: DocumentManagementService; + let scraperService: ScraperService; + let pipelineManager: PipelineManager; + + const TEST_BASE_URL = "http://test-docs.example.com"; + const TEST_LIBRARY = "test-lib"; + const TEST_VERSION = "1.0.0"; + + beforeEach(async () => { + // Initialize in-memory store and services + // DocumentManagementService creates its own DocumentStore internally + docService = new DocumentManagementService(":memory:", null); + await docService.initialize(); + const registry = new ScraperRegistry(); + scraperService = new ScraperService(registry); + pipelineManager = new PipelineManager(docService, 3, { recoverJobs: false }); + await pipelineManager.start(); + + // Clear any previous nock mocks + nock.cleanAll(); + }); + + afterEach(async () => { + // Cleanup + await pipelineManager.stop(); + await docService.shutdown(); + nock.cleanAll(); + vol.reset(); + }); + + describe("Refresh Scenarios", () => { + it("should delete documents when a page returns 404 during refresh", async () => { + // Setup: Mock initial two-page site + nock(TEST_BASE_URL) + .get("/") + .reply(200, "

<html><body><h1>Home</h1><a href='/page1'>Page 1</a><a href='/page2'>Page 2</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }) + .get("/page1") + .reply(200, "<html><body><h1>Page 1</h1><p>Content of page 1</p></body></html>", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }) + .get("/page2") + .reply(200, "<html><body><h1>Page 2</h1><p>Content of page 2</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page2-v1"', + }); + + // Initial scrape + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + // Wait for job to complete + await pipelineManager.waitForJobCompletion(initialJobId); + + // Verify all pages were indexed + const initialSearch = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "page", 10); + expect(initialSearch.length).toBeGreaterThan(0); + + // Get page IDs for verification + const pages = await docService.getPagesByVersionId( + await docService.ensureVersion({ library: TEST_LIBRARY, version: TEST_VERSION }), + ); + expect(pages.length).toBe(3); // home, page1, page2 + + const page2 = pages.find((p) => p.url === `${TEST_BASE_URL}/page2`); + expect(page2).toBeDefined(); + + // Setup: Mock refresh with page2 deleted (404) + // Enable nock logging to see what requests are made + nock(TEST_BASE_URL) + .get("/") + .matchHeader("if-none-match", '"home-v1"') + .reply(304, undefined, { ETag: '"home-v1"' }) // Unchanged + .get("/page1") + .matchHeader("if-none-match", '"page1-v1"') + .reply(304, undefined, { ETag: '"page1-v1"' }) // Unchanged + .get("/page2") + .matchHeader("if-none-match", '"page2-v1"') + .reply(404); // Deleted! + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify page2 documents were deleted by checking if we can still find page2 content + // Use a unique phrase that only appears in page2 to avoid false positives from keyword matching + const page2Search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "Content of page", 10); + const hasPage2Content = page2Search.some((r: StoreSearchResult) => + r.url === `${TEST_BASE_URL}/page2` + ); + expect(hasPage2Content).toBe(false); + + // Verify page1 documents still exist + const page1Search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "Content of page 1", 10); + expect(page1Search.length).toBeGreaterThan(0); + }, 30000); + + it("should update documents when a page has changed content during refresh", async () => { + const originalContent = "Original content version 1"; + const updatedContent = "Updated content version 2"; + + // Setup: Mock initial site + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + `

<html><body><h1>Home</h1><a href='/page1'>Page 1</a></body></html>`, + { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/page1") + .reply( + 200, + `<html><body><h1>Page 1</h1><p>${originalContent}</p></body></html>
`, + { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }, + ); + + // Initial scrape + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Verify original content is indexed + const initialSearch = await docService.searchStore( + TEST_LIBRARY, + TEST_VERSION, + "original content", + 10, + ); + expect(initialSearch.length).toBeGreaterThan(0); + expect(initialSearch[0].content).toContain(originalContent); + + // Setup: Mock refresh with updated page1 content + nock(TEST_BASE_URL) + .get("/") + .reply(304, undefined, { ETag: '"home-v1"' }) // Unchanged + .get("/page1") + .reply( + 200, + `

<html><body><h1>Page 1</h1><p>${updatedContent}</p></body></html>
`, + { + "Content-Type": "text/html", + ETag: '"page1-v2"', // New ETag indicates change + }, + ); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify updated content is now indexed + const updatedSearch = await docService.searchStore( + TEST_LIBRARY, + TEST_VERSION, + "updated content", + 10, + ); + expect(updatedSearch.length).toBeGreaterThan(0); + expect(updatedSearch[0].content).toContain(updatedContent); + + // Verify old content is no longer indexed + const oldSearch = await docService.searchStore( + TEST_LIBRARY, + TEST_VERSION, + "original content", + 10, + ); + const hasOldContent = oldSearch.some((r: StoreSearchResult) => r.content.includes(originalContent)); + expect(hasOldContent).toBe(false); + }, 30000); + + it("should skip processing when pages return 304 Not Modified", async () => { + // Setup: Mock initial site + nock(TEST_BASE_URL) + .get("/") + .reply(200, "

<html><body><h1>Home</h1><a href='/page1'>Page 1</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }) + .get("/page1") + .reply(200, "<html><body><h1>Page 1</h1><p>Stable content</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }); + + // Initial scrape + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Get initial document count + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const initialPages = await docService.getPagesByVersionId(versionId); + const initialPageCount = initialPages.length; + + // Setup: Mock refresh with all 304 responses + nock(TEST_BASE_URL) + .get("/") + .reply(304, undefined, { ETag: '"home-v1"' }) + .get("/page1") + .reply(304, undefined, { ETag: '"page1-v1"' }); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify page count hasn't changed + const finalPages = await docService.getPagesByVersionId(versionId); + expect(finalPages.length).toBe(initialPageCount); + + // Verify content is still accessible + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "stable", 10); + expect(search.length).toBeGreaterThan(0); + }, 30000); + + it("should discover and index new pages during refresh", async () => { + // Setup: Mock initial site with 2 pages + nock(TEST_BASE_URL) + .get("/") + .reply(200, "

<html><body><h1>Home</h1><a href='/page1'>Page 1</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }) + .get("/page1") + .reply(200, "<html><body><h1>Page 1</h1><p>Original page</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }); + + // Initial scrape + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Verify initial page count + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const initialPages = await docService.getPagesByVersionId(versionId); + expect(initialPages.length).toBe(2); // home, page1 + + // Setup: Mock refresh where home page now links to a new page2 + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

<html><body><h1>Home</h1><a href='/page1'>Page 1</a><a href='/page2'>Page 2</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v2"', // Changed ETag + }, + ) + .get("/page1") + .reply(304, undefined, { ETag: '"page1-v1"' }) // Unchanged + .get("/page2") + .reply(200, "<html><body><h1>Page 2</h1><p>Newly added page</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page2-v1"', + }); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify new page was discovered and indexed + const finalPages = await docService.getPagesByVersionId(versionId); + expect(finalPages.length).toBe(3); // home, page1, page2 + + const page2 = finalPages.find((p) => p.url === `${TEST_BASE_URL}/page2`); + expect(page2).toBeDefined(); + + // Verify new page content is searchable + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "newly added", 10); + expect(search.length).toBeGreaterThan(0); + }, 30000); + }); + + describe("Standard Scrape Error Handling", () => { + it("should gracefully handle 404 errors for broken links during normal scraping", async () => { + // Setup: Mock site with a broken link + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

<html><body><h1>Home</h1><a href='/valid-page'>Valid</a><a href='/broken-link'>Broken</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/valid-page") + .reply(200, "<html><body><h1>Valid Page</h1><p>This page exists</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"valid-v1"', + }) + .get("/broken-link") + .reply(404); // Broken link! + + // Execute scrape + const jobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + // Should complete successfully despite the 404 + await pipelineManager.waitForJobCompletion(jobId); + + const job = await pipelineManager.getJob(jobId); + expect(job?.status).toBe("completed"); + + // Verify valid pages were indexed + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + + // Should have home and valid-page, but NOT broken-link + expect(pages.length).toBe(2); + const urls = pages.map((p) => p.url); + expect(urls).toContain(`${TEST_BASE_URL}/`); + expect(urls).toContain(`${TEST_BASE_URL}/valid-page`); + expect(urls).not.toContain(`${TEST_BASE_URL}/broken-link`); + + // Verify valid page content is searchable + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "exists", 10); + expect(search.length).toBeGreaterThan(0); + }, 30000); + + it("should continue scraping after encountering multiple 404 errors", async () => { + // Setup: Mock site with multiple broken links interspersed with valid ones + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

<html><body><h1>Home</h1><a href='/page1'>P1</a><a href='/404-1'>404</a><a href='/page2'>P2</a><a href='/404-2'>404</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/page1") + .reply(200, "<html><body><h1>Page 1</h1></body></html>", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }) + .get("/404-1") + .reply(404) + .get("/page2") + .reply(200, "<html><body><h1>Page 2</h1></body></html>
", { + "Content-Type": "text/html", + ETag: '"page2-v1"', + }) + .get("/404-2") + .reply(404); + + // Execute scrape + const jobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(jobId); + + // Verify all valid pages were indexed despite multiple 404s + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + + expect(pages.length).toBe(3); // home, page1, page2 + const urls = pages.map((p) => p.url); + expect(urls).toContain(`${TEST_BASE_URL}/`); + expect(urls).toContain(`${TEST_BASE_URL}/page1`); + expect(urls).toContain(`${TEST_BASE_URL}/page2`); + }, 30000); + }); + + describe("Resiliency", () => { + it("should handle network timeouts gracefully and continue processing other pages", async () => { + // Setup: Mock initial site where one page times out + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

<html><body><h1>Home</h1><a href='/page1'>Page 1</a><a href='/timeout-page'>Timeout</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/page1") + .reply(200, "<html><body><h1>Page 1</h1><p>Working page</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }) + .get("/timeout-page") + .delayConnection(30000) // Simulate timeout + .reply(200, "Should never reach this"); + + // Execute scrape - should complete despite timeout + const jobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(jobId); + + // Verify that the working pages were indexed despite the timeout + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + + // Should have home and page1, but timeout-page should have failed + expect(pages.length).toBeGreaterThanOrEqual(2); + const urls = pages.map((p) => p.url); + expect(urls).toContain(`${TEST_BASE_URL}/`); + expect(urls).toContain(`${TEST_BASE_URL}/page1`); + + // Verify working page content is searchable + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "working page", 10); + expect(search.length).toBeGreaterThan(0); + }, 60000); + + it("should follow redirects and use the final URL for indexing", async () => { + // Setup: Mock site with redirect + nock(TEST_BASE_URL) + .get("/") + .reply( + 200, + "

<html><body><h1>Home</h1><a href='/old-url'>Old Link</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }, + ) + .get("/old-url") + .reply(301, undefined, { + Location: `${TEST_BASE_URL}/new-url`, + }) + .get("/new-url") + .reply(200, "<html><body><h1>New Page</h1><p>Redirected content</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"new-v1"', + }); + + // Execute scrape + const jobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(jobId); + + // Verify pages were indexed + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + + // Should have indexed with the final (redirected) URL + const urls = pages.map((p) => p.url); + expect(urls).toContain(`${TEST_BASE_URL}/`); + expect(urls).toContain(`${TEST_BASE_URL}/new-url`); + + // Verify content from redirected page is searchable + const search = await docService.searchStore( + TEST_LIBRARY, + TEST_VERSION, + "redirected content", + 10, + ); + expect(search.length).toBeGreaterThan(0); + }, 30000); + + it("should handle redirect chains during refresh and update canonical URLs", async () => { + // Setup: Initial scrape with direct URL + nock(TEST_BASE_URL) + .get("/") + .reply(200, "

<html><body><h1>Home</h1><a href='/page1'>Page</a></body></html>", { + "Content-Type": "text/html", + ETag: '"home-v1"', + }) + .get("/page1") + .reply(200, "<html><body><h1>Page 1</h1><p>Original location</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page1-v1"', + }); + + const initialJobId = await pipelineManager.enqueueScrapeJob(TEST_LIBRARY, TEST_VERSION, { + url: `${TEST_BASE_URL}/`, + library: TEST_LIBRARY, + version: TEST_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Setup: Refresh where page1 now redirects to a new location + nock(TEST_BASE_URL) + .get("/") + .reply(304, undefined, { ETag: '"home-v1"' }) + .get("/page1") + .reply(301, undefined, { + Location: `${TEST_BASE_URL}/page1-new`, + }) + .get("/page1-new") + .reply(200, "

<html><body><h1>Page 1 New</h1><p>New location</p></body></html>
", { + "Content-Type": "text/html", + ETag: '"page1-new-v1"', + }); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob(TEST_LIBRARY, TEST_VERSION); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify the canonical URL was updated + const versionId = await docService.ensureVersion({ + library: TEST_LIBRARY, + version: TEST_VERSION, + }); + const pages = await docService.getPagesByVersionId(versionId); + const urls = pages.map((p) => p.url); + + // Should now have the new URL + expect(urls).toContain(`${TEST_BASE_URL}/page1-new`); + + // Verify content from new location is searchable + const search = await docService.searchStore(TEST_LIBRARY, TEST_VERSION, "new location", 10); + expect(search.length).toBeGreaterThan(0); + }, 30000); + }); + + describe("File-Based Refresh Scenarios", () => { + const TEST_FILE_BASE = "/test-docs"; + const TEST_FILE_LIBRARY = "file-lib"; + const TEST_FILE_VERSION = "1.0.0"; + + beforeEach(() => { + vol.reset(); + }); + + it("should detect new files, modified files, and deleted files during refresh", async () => { + // Setup: Create initial file structure + vol.fromJSON({ + [`${TEST_FILE_BASE}/index.md`]: "# Home\nWelcome to the docs", + [`${TEST_FILE_BASE}/guide.md`]: "# Guide\nOriginal guide content", + [`${TEST_FILE_BASE}/api.md`]: "# API\nAPI documentation", + }); + + // Initial scrape - point to the directory to discover all files + const initialJobId = await pipelineManager.enqueueScrapeJob( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + { + url: `file://${TEST_FILE_BASE}`, + library: TEST_FILE_LIBRARY, + version: TEST_FILE_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions, + ); + + await pipelineManager.waitForJobCompletion(initialJobId); + + // Verify initial files were indexed + const versionId = await docService.ensureVersion({ + library: TEST_FILE_LIBRARY, + version: TEST_FILE_VERSION, + }); + const initialPages = await docService.getPagesByVersionId(versionId); + expect(initialPages.length).toBe(3); // index.md, guide.md, api.md + + // Modify file structure: + // 1. Delete api.md + // 2. Modify guide.md + // 3. 
Add new tutorial.md + vol.reset(); + vol.fromJSON({ + [`${TEST_FILE_BASE}/index.md`]: "# Home\nWelcome to the docs", + [`${TEST_FILE_BASE}/guide.md`]: "# Guide\nUpdated guide content with new information", + [`${TEST_FILE_BASE}/tutorial.md`]: "# Tutorial\nStep-by-step tutorial", + }); + + // Wait a bit to ensure file modification times change + await new Promise((resolve) => setTimeout(resolve, 100)); + + // Execute refresh + const refreshJobId = await pipelineManager.enqueueRefreshJob( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + ); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify final state + const finalPages = await docService.getPagesByVersionId(versionId); + const finalUrls = finalPages.map((p) => p.url); + + // Should have index, guide, tutorial (but not api) + expect(finalPages.length).toBe(3); + expect(finalUrls).toContain(`file://${TEST_FILE_BASE}/index.md`); + expect(finalUrls).toContain(`file://${TEST_FILE_BASE}/guide.md`); + expect(finalUrls).toContain(`file://${TEST_FILE_BASE}/tutorial.md`); + expect(finalUrls).not.toContain(`file://${TEST_FILE_BASE}/api.md`); + + // Verify modified content is searchable + const modifiedSearch = await docService.searchStore( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + "updated guide content", + 10, + ); + expect(modifiedSearch.length).toBeGreaterThan(0); + + // Verify new file content is searchable + const newSearch = await docService.searchStore( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + "step-by-step tutorial", + 10, + ); + expect(newSearch.length).toBeGreaterThan(0); + + // Verify deleted file content is no longer searchable + const deletedSearch = await docService.searchStore( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + "API documentation", + 10, + ); + const hasDeletedContent = deletedSearch.some( + (r: StoreSearchResult) => r.url === `file://${TEST_FILE_BASE}/api.md`, + ); + expect(hasDeletedContent).toBe(false); + }, 30000); + + it("should handle unchanged files efficiently during file-based refresh", async () => { + // Setup: Create file structure + vol.fromJSON({ + [`${TEST_FILE_BASE}/doc1.md`]: "# Document 1\nStable content", + [`${TEST_FILE_BASE}/doc2.md`]: "# Document 2\nStable content", + }); + + // Initial scrape - point to the directory to discover all files + const initialJobId = await pipelineManager.enqueueScrapeJob( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + { + url: `file://${TEST_FILE_BASE}`, + library: TEST_FILE_LIBRARY, + version: TEST_FILE_VERSION, + maxPages: 10, + maxDepth: 2, + } satisfies ScraperOptions, + ); + + await pipelineManager.waitForJobCompletion(initialJobId); + + const versionId = await docService.ensureVersion({ + library: TEST_FILE_LIBRARY, + version: TEST_FILE_VERSION, + }); + const initialPages = await docService.getPagesByVersionId(versionId); + const initialPageCount = initialPages.length; + + // Execute refresh without modifying files + const refreshJobId = await pipelineManager.enqueueRefreshJob( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + ); + await pipelineManager.waitForJobCompletion(refreshJobId); + + // Verify page count hasn't changed + const finalPages = await docService.getPagesByVersionId(versionId); + expect(finalPages.length).toBe(initialPageCount); + + // Verify content is still searchable + const search = await docService.searchStore( + TEST_FILE_LIBRARY, + TEST_FILE_VERSION, + "stable content", + 10, + ); + expect(search.length).toBeGreaterThan(0); + }, 30000); + }); +}); diff --git a/test/vector-search-e2e.test.ts b/test/vector-search-e2e.test.ts index 
d0e508cb..66755638 100644 --- a/test/vector-search-e2e.test.ts +++ b/test/vector-search-e2e.test.ts @@ -212,7 +212,7 @@ describe("Vector Search End-to-End Tests", () => { library: "non-existent-library", version: "1.0.0", query: "test query", - })).rejects.toThrow("Library 'non-existent-library' not found"); + })).rejects.toThrow("Library non-existent-library not found in store. Did you mean: test-library?"); }, 10000); it("should handle non-existent version searches gracefully", async () => { diff --git a/tsconfig.test.json b/tsconfig.test.json new file mode 100644 index 00000000..631f88a4 --- /dev/null +++ b/tsconfig.test.json @@ -0,0 +1,5 @@ +{ + "extends": "./tsconfig.json", + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +}
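
A minimal usage sketch for the new RefreshVersionTool, based on the interfaces added in src/tools/RefreshVersionTool.ts above; the pipeline instance, the import path, the async context, and the example library/version values are assumptions for illustration, not part of this diff:

    import { RefreshVersionTool } from "./src/tools"; // assumed path; adjust to the caller's location

    // Assumes an IPipeline instance (e.g. a started PipelineManager) is already available as `pipeline`.
    const refreshTool = new RefreshVersionTool(pipeline);

    // Wait for the refresh job and report how many pages were re-scraped.
    const result = await refreshTool.execute({ library: "react", version: "18.2.0" });
    if ("pagesRefreshed" in result) {
      console.log(`Refreshed ${result.pagesRefreshed} pages`);
    }

    // Or enqueue the job and return immediately with its ID.
    const queued = await refreshTool.execute({
      library: "react",
      version: "18.2.0",
      waitForCompletion: false,
    });
    if ("jobId" in queued) {
      console.log(`Refresh queued as job ${queued.jobId}`);
    }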