
Commit 203a561

feat(scraper): implement efficient version refresh using ETags
This commit introduces a version refresh feature that enables efficient re-indexing of previously scraped library versions. By leveraging HTTP ETags, the new mechanism avoids re-processing unchanged pages, significantly reducing processing time, bandwidth usage, and embedding costs.

### Problem

Previously, re-indexing a library version was a wasteful process that required deleting all existing data and re-scraping every page from scratch, even if most of the content was unchanged.

### Solution

This implementation introduces a refresh mechanism that re-visits all previously scraped pages with their stored ETags. This allows the scraper to:

- **Skip** unchanged pages (HTTP 304 Not Modified).
- **Re-process** only pages that have changed (HTTP 200 OK).
- **Delete** documents for pages that are no longer available (HTTP 404 Not Found).

### Key Architectural Changes

- **Reused Existing Scraper Infrastructure**: The refresh operation is a standard scrape job with a pre-populated `initialQueue`. This leverages existing logic for progress tracking, error handling, and state management.
- **Database Schema Update**: A `depth` column has been added to the `pages` table to ensure that refresh operations respect the original `maxDepth` constraints. A database migration (`010-add-depth-to-pages.sql`) is included to apply this change.
- **Conditional Fetching**: The scraper's `processItem` logic has been updated to handle conditional requests. It now correctly processes 304, 404, and 200 HTTP responses to skip, delete, or update documents, respectively.
- **Pipeline Manager Integration**: A new `enqueueRefreshJob` method was added to the `PipelineManager` to orchestrate the refresh process by fetching pages from the database and populating the `initialQueue`.
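For illustration, a minimal sketch of the skip/delete/update decision described above, using the standard `fetch` API; the `handlePage` helper and `RefreshResult` type are hypothetical, not the project's actual `processItem` implementation:

```ts
// Sketch only: one conditional request per previously scraped page.
type RefreshResult =
  | { action: "skip" } // 304 Not Modified: keep stored documents
  | { action: "delete" } // 404 Not Found: remove stored documents
  | { action: "update"; body: string; etag: string | null }; // 200 OK: re-process

async function handlePage(url: string, storedEtag: string | null): Promise<RefreshResult> {
  const headers: Record<string, string> = {};
  if (storedEtag) headers["If-None-Match"] = storedEtag; // makes the request conditional
  const res = await fetch(url, { headers });

  if (res.status === 304) return { action: "skip" };
  if (res.status === 404) return { action: "delete" };
  if (res.ok) {
    return { action: "update", body: await res.text(), etag: res.headers.get("ETag") };
  }
  throw new Error(`Unexpected HTTP ${res.status} for ${url}`);
}
```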
1 parent 9fc6cf9 commit 203a561

File tree

87 files changed: +3853 −2744 lines changed

010-add-depth-to-pages.sql

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+-- Migration 010: Add depth column to pages table for refresh functionality
+-- This enables tracking the original crawl depth of each page, which is essential
+-- for maintaining consistent depth constraints during refresh operations.
+
+-- Add depth column to pages table
+ALTER TABLE pages ADD COLUMN depth INTEGER;
+
+-- Backfill existing pages with depth 0 (conservative default)
+-- This ensures all existing pages have a valid depth value
+UPDATE pages SET depth = 0 WHERE depth IS NULL;
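The refresh flow needs each stored page's URL, ETag, and depth to rebuild the initial queue while honoring the original `maxDepth`. A hedged sketch of such a query; the `version_id` column and the `db.all` accessor are assumptions, not the project's actual schema or data layer:

```ts
// Illustrative only: select previously scraped pages whose original crawl
// depth still falls within the job's maxDepth.
interface PageRow {
  url: string;
  etag: string | null;
  depth: number;
}

function loadRefreshQueue(
  db: { all(sql: string, ...params: unknown[]): PageRow[] }, // assumed accessor
  versionId: number, // assumed foreign key on pages
  maxDepth: number,
): PageRow[] {
  return db.all(
    "SELECT url, etag, depth FROM pages WHERE version_id = ? AND depth <= ?",
    versionId,
    maxDepth,
  );
}
```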

src/cli/utils.ts

Lines changed: 2 additions & 2 deletions
@@ -173,7 +173,7 @@ export async function createPipelineWithCallbacks(
 ): Promise<IPipeline> {
   logger.debug(`Initializing pipeline with options: ${JSON.stringify(options)}`);
   const { serverUrl, ...rest } = options;
-  const pipeline = serverUrl
+  const pipeline: IPipeline = serverUrl
     ? await PipelineFactory.createPipeline(undefined, { serverUrl, ...rest })
     : await (async () => {
         if (!docService) {
@@ -194,7 +194,7 @@
     },
     onJobError: async (job, error, document) => {
       logger.warn(
-        `⚠️ Job ${job.id} error ${document ? `on document ${document.metadata.url}` : ""}: ${error.message}`,
+        `⚠️ Job ${job.id} error ${document ? `on document ${document.url}` : ""}: ${error.message}`,
       );
     },
   });

src/pipeline/PipelineClient.test.ts

Lines changed: 10 additions & 8 deletions
@@ -6,7 +6,8 @@ vi.mock("../utils/logger");
 // Mock tRPC client factory
 const mockClient: any = {
   ping: { query: vi.fn() },
-  enqueueJob: { mutate: vi.fn() },
+  enqueueScrapeJob: { mutate: vi.fn() },
+  enqueueRefreshJob: { mutate: vi.fn() },
   getJob: { query: vi.fn() },
   getJobs: { query: vi.fn() },
   cancelJob: { mutate: vi.fn() },
@@ -28,7 +29,8 @@ describe("PipelineClient", () => {
     vi.resetAllMocks();
     // Reset default mock behaviors
     mockClient.ping.query.mockResolvedValue({ status: "ok" });
-    mockClient.enqueueJob.mutate.mockResolvedValue({ jobId: "job-123" });
+    mockClient.enqueueScrapeJob.mutate.mockResolvedValue({ jobId: "job-123" });
+    mockClient.enqueueRefreshJob.mutate.mockResolvedValue({ jobId: "job-456" });
     mockClient.getJob.query.mockResolvedValue(undefined);
     mockClient.getJobs.query.mockResolvedValue({ jobs: [] });
     mockClient.cancelJob.mutate.mockResolvedValue({ success: true });
@@ -50,18 +52,18 @@ describe("PipelineClient", () => {
     });
   });

-  describe("enqueueJob", () => {
+  describe("enqueueScrapeJob", () => {
     it("should delegate job creation to external API", async () => {
       const mockJobId = "job-123";
-      mockClient.enqueueJob.mutate.mockResolvedValueOnce({ jobId: mockJobId });
-      const jobId = await client.enqueueJob("react", "18.0.0", {
+      mockClient.enqueueScrapeJob.mutate.mockResolvedValueOnce({ jobId: mockJobId });
+      const jobId = await client.enqueueScrapeJob("react", "18.0.0", {
         url: "https://react.dev",
         library: "react",
         version: "18.0.0",
       });

       expect(jobId).toBe(mockJobId);
-      expect(mockClient.enqueueJob.mutate).toHaveBeenCalledWith({
+      expect(mockClient.enqueueScrapeJob.mutate).toHaveBeenCalledWith({
         library: "react",
         version: "18.0.0",
         options: {
@@ -73,9 +75,9 @@ describe("PipelineClient", () => {
     });

     it("should handle API errors gracefully", async () => {
-      mockClient.enqueueJob.mutate.mockRejectedValueOnce(new Error("Bad request"));
+      mockClient.enqueueScrapeJob.mutate.mockRejectedValueOnce(new Error("Bad request"));

-      await expect(client.enqueueJob("invalid", null, {} as any)).rejects.toThrow(
+      await expect(client.enqueueScrapeJob("invalid", null, {} as any)).rejects.toThrow(
         "Failed to enqueue job: Bad request",
       );
     });

src/pipeline/PipelineClient.ts

Lines changed: 24 additions & 2 deletions
@@ -68,7 +68,7 @@ export class PipelineClient implements IPipeline {
     logger.debug("PipelineClient stopped");
   }

-  async enqueueJob(
+  async enqueueScrapeJob(
     library: string,
     version: string | undefined | null,
     options: ScraperOptions,
@@ -78,7 +78,7 @@
         typeof version === "string" && version.trim().length === 0
           ? null
          : (version ?? null);
-      const result = await this.client.enqueueJob.mutate({
+      const result = await this.client.enqueueScrapeJob.mutate({
         library,
         version: normalizedVersion,
         options,
@@ -92,6 +92,28 @@
     }
   }

+  async enqueueRefreshJob(
+    library: string,
+    version: string | undefined | null,
+  ): Promise<string> {
+    try {
+      const normalizedVersion =
+        typeof version === "string" && version.trim().length === 0
+          ? null
+          : (version ?? null);
+      const result = await this.client.enqueueRefreshJob.mutate({
+        library,
+        version: normalizedVersion,
+      });
+      logger.debug(`Refresh job ${result.jobId} enqueued successfully`);
+      return result.jobId;
+    } catch (error) {
+      throw new Error(
+        `Failed to enqueue refresh job: ${error instanceof Error ? error.message : String(error)}`,
+      );
+    }
+  }
+
   async getJob(jobId: string): Promise<PipelineJob | undefined> {
     try {
       const serializedJob = await this.client.getJob.query({ id: jobId });
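A brief usage sketch for the new client method, assuming an already-constructed `PipelineClient`; the setup is elided and the version argument is a placeholder:

```ts
import { PipelineClient } from "./PipelineClient";

// Assume an already-constructed, connected client.
declare const client: PipelineClient;

// Version normalization mirrors enqueueScrapeJob: "" and undefined become null.
const jobId = await client.enqueueRefreshJob("react", "18.0.0");
console.log(`Refresh job ${jobId} enqueued`);
```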

src/pipeline/PipelineFactory.ts

Lines changed: 1 addition & 2 deletions
@@ -24,8 +24,7 @@ export namespace PipelineFactory {
   // Overload: Remote pipeline client (out-of-process worker)
   export async function createPipeline(
     docService: undefined,
-    options: Required<Pick<PipelineOptions, "serverUrl">> &
-      Omit<PipelineOptions, "serverUrl">,
+    options: PipelineOptions & { serverUrl: string },
   ): Promise<PipelineClient>;
   // Implementation
   export async function createPipeline(
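At call sites, the simplified overload is satisfied by any options object whose `serverUrl` is a string. A sketch, assuming the remaining `PipelineOptions` fields are optional and using a placeholder URL:

```ts
import { PipelineFactory } from "./PipelineFactory";

// Selects the remote-client overload: docService is undefined and
// serverUrl is provided. The URL below is a placeholder, not a real endpoint.
const pipeline = await PipelineFactory.createPipeline(undefined, {
  serverUrl: "http://localhost:8080",
});
```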
