Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"js-levenshtein": "^1.1.6",
"js-yaml": "^4.1.0",
"minio": "^7.1.3",
"normalize-url": "^8.1.0",
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
Expand Down
18 changes: 17 additions & 1 deletion src/util/state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
import { interpolateFilename, UploadResult } from "./storage.js";
import normalizeUrl, { Options as NormamlizeUrlOptions } from "normalize-url";

// ============================================================================
export enum LoadState {
Expand All @@ -28,6 +29,20 @@ export enum QueueState {
DUPE_URL = 2,
}

// ============================================================================
// Options handed to normalizeUrl() when this module normalizes a URL before
// queueing it or building a dedup key. Every lossy rewrite is switched off so
// that the only intended change is a stable ordering of query parameters.
const normalizeUrlOpts: NormamlizeUrlOptions = {
defaultProtocol: "https",
stripAuthentication: false,
stripTextFragment: false,
stripWWW: false,
stripHash: false,
removeTrailingSlash: false,
removeSingleSlash: false,
removeExplicitPort: false,
// normalize-url removes utm_* query parameters unless told otherwise;
// disable that so the stored URL keeps every parameter it was given
removeQueryParameters: false,
// two URLs that differ only in query argument order normalize to the same string
sortQueryParameters: true,
removePath: false,
};

// ============================================================================
// treat 0 or 206 as 200 for purposes of dedup
function normalizeDedupStatus(status: number): number {
Expand Down Expand Up @@ -673,7 +688,6 @@ return inx;
return res >= 3;
}

//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
async addToQueue(
{
url,
Expand All @@ -685,6 +699,7 @@ return inx;
}: QueueEntry,
limit = 0,
) {
url = normalizeUrl(url, normalizeUrlOpts);
const added = this._timestamp();
const data: QueueEntry = { added, url, seedId, depth, extraHops };

Expand Down Expand Up @@ -1010,6 +1025,7 @@ return inx;
}

async addIfNoDupe(key: string, url: string, status: number) {
url = normalizeUrl(url, normalizeUrlOpts);
return (
(await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
1
Expand Down
15 changes: 15 additions & 0 deletions tests/url-normalize.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import fs from "fs";
import child_process from "child_process";

test("ensure URLs with same query args but in different order considered same URL", async () => {
  // Run a crawl with two seeds that differ only in the order of their
  // query arguments (A=1&B=2 vs B=2&A=1).
  const crawlCommand =
    "docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url 'https://example-com.webrecorder.net/?A=1&B=2' --url 'https://example-com.webrecorder.net/?B=2&A=1' --collection url-norm-1 --scopeType page";
  child_process.execSync(crawlCommand);

  // Read the page list written for this collection.
  const pagesJsonl = fs.readFileSync(
    "test-crawls/collections/url-norm-1/pages/pages.jsonl",
    "utf8",
  );
  const entries = pagesJsonl.trim().split("\n");

  // Both seeds should normalize to one URL: expect a single page line
  // in addition to the header line.
  const headerLines = 1;
  const expectedPages = 1;
  expect(entries.length).toBe(expectedPages + headerLines);
});

5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4150,6 +4150,11 @@ normalize-path@^3.0.0:
resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65"
integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==

normalize-url@^8.1.0:
version "8.1.0"
resolved "https://registry.yarnpkg.com/normalize-url/-/normalize-url-8.1.0.tgz#d33504f67970decf612946fd4880bc8c0983486d"
integrity sha512-X06Mfd/5aKsRHc0O0J5CUedwnPmnDtLF2+nq+KN9KSDlJHkPuh0JUviWjEWMe0SW/9TDdSLVPuk7L5gGTIA1/w==

npm-run-path@^4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea"
Expand Down
Loading