index.js

import crypto from "crypto";
const URI_PARSE =
  /^(?:([^:\/?#]+):)?(?:\/\/((?:([^\/?#@]*)@)?(\[[^\/?#\]]+\]|[^\/?#:]*)(?:\:(\d*))?))?([^?#]*)(?:\?([^#]*))?(?:#((?:.|\n|\r)*))?/i; // regex for tokenising url from urijs module

const token = "%[a-f0-9]{2}";
const singleMatcher = new RegExp(token, "gi");
const multiMatcher = new RegExp("(" + token + ")+", "gi");

function decodeComponents(components, split) {
  try {
    // Try to decode the entire string first
    return [decodeURIComponent(components.join(""))];
  } catch (err) {
    // Do nothing
  }

  if (components.length === 1) {
    return components;
  }

  split = split || 1;

  // Split the array in 2 parts
  var left = components.slice(0, split);
  var right = components.slice(split);

  return Array.prototype.concat.call(
    [],
    decodeComponents(left),
    decodeComponents(right)
  );
}

function decode(input) {
  try {
    return decodeURIComponent(input);
  } catch (err) {
    var tokens = input.match(singleMatcher);

    for (var i = 1; i < (tokens ?? []).length; i++) {
      input = decodeComponents(tokens, i).join("");

      tokens = input.match(singleMatcher);
    }

    return input;
  }
}

function customDecodeURIComponent(input) {
  // Keep track of all the replacements and prefill the map with the `BOM`
  var replaceMap = {
    "%FE%FF": "\uFFFD\uFFFD",
    "%FF%FE": "\uFFFD\uFFFD",
  };

  var match = multiMatcher.exec(input);
  while (match) {
    try {
      // Decode as big chunks as possible
      replaceMap[match[0]] = decodeURIComponent(match[0]);
    } catch (err) {
      var result = decode(match[0]);

      if (result !== match[0]) {
        replaceMap[match[0]] = result;
      }
    }

    match = multiMatcher.exec(input);
  }

  // Add `%C2` at the end of the map to make sure it does not replace the combinator before everything else
  replaceMap["%C2"] = "\uFFFD";

  var entries = Object.keys(replaceMap);

  for (var i = 0; i < entries.length; i++) {
    // Replace all decoded components
    var key = entries[i];
    input = input.replace(new RegExp(key, "g"), replaceMap[key]);
  }

  return input;
}

function int2ip(ipInt) {
  return (
    (ipInt >>> 24) +
    "." +
    ((ipInt >> 16) & 255) +
    "." +
    ((ipInt >> 8) & 255) +
    "." +
    (ipInt & 255)
  );
}

function escapeCharacter(code) {
  const chr = String.fromCharCode(code);
  if (code < 16) return "%0" + code.toString(16);
  else if (
    (code <= 32 || code > 127 || chr === "%" || chr === "#") &&
    code < 256
  )
    return "%" + code.toString(16);
  else if (code >= 256) {
    return escapeCharacter(code >> 8) + escapeCharacter(code % 256);
  }
  return chr;
}

function webriskURIEscape(c) {
  return c.replace(/./g, (chr) => escapeCharacter(chr.charCodeAt(0)));
}

const normalizeIPAddress = (c) => {
  if (c.match(/^\d+$/)) {
    return int2ip(c);
  }
  return c;
};

const normalizeComponentEncoding = (c) => {
  let value = c;
  let prevValue;
  for (
    let infiniteLoopPreventor = 0;
    infiniteLoopPreventor < 1000;
    infiniteLoopPreventor++
  ) {
    prevValue = value;
    value = customDecodeURIComponent(prevValue).replace(/[\t\x0a\x0d]/g, "");
    if (value === prevValue) {
      break;
    }
  }
  return webriskURIEscape(value);
};

const normalizeDotsInPaths = (path) => {
  const pathParts = path
    .split("/")
    .filter(
      (el, index, arr) =>
        (el != "" || index === 0 || index === arr.length - 1) && el != "."
    )
    .filter((el, index, arr) => el != ".." && arr[index + 1] !== "..");
  return pathParts.join("/") || "/";
};

export const canonicalize = function (url) {
  const urlWithScheme = url.includes("://")
    ? url.trim()
    : "http://" + url.trim();
  const [, schema, , userinfo, host, , path, query] =
    urlWithScheme.match(URI_PARSE);
  if (!schema || !host || host.length > 255) {
    return null;
  }
  const normalizedHost = normalizeComponentEncoding(normalizeIPAddress(host))
    .replace(/\.+$/, "")
    .toLowerCase();
  const normalizedPath = normalizeComponentEncoding(normalizeDotsInPaths(path));
  const normalizedQuery = query !== undefined ? `?${query}` : "";
  
  return `${schema}://${normalizedHost}${normalizedPath}${normalizedQuery}`;
};

export const suffixPostfixExpressions = function (canonicalURL) {
  const [, schema, , userinfo, host, , path, query, fragment] =
    canonicalURL.match(URI_PARSE);
  const fullExpression = host + path;
  let iDomain = host;
  const res = [];
  while (iDomain.match(/.*\..*/) && !iDomain.match(/^(\d+\.){2}\d+$/)) {
    const domainRes = [];
    if (query) {
      domainRes.push(iDomain + path + "?" + query);
    }
    res.push(domainRes);
    let iPath = path;
    while (iPath.match(/\/.+/)) {
      domainRes.push(iDomain + iPath);
      iPath = iPath.replace(/[^/]*\/?$/, "");
    }
    domainRes.push(iDomain + "/");
    domainRes.splice(1, domainRes.length - 6);
    iDomain = iDomain.replace(/^.*?\./, "");
  }
  res.splice(1, res.length - 5);
  return new Set([].concat(...res));
};

export const truncatedSha256Prefix = (str, bits) => {
  const hash = crypto.createHash("sha256").update(str).digest();
  return hash.subarray(0, bits / 8);
};

export const getPrefixMap = (url, size = 32 * 8) => {
  const canonical = canonicalize(url);
  return Array.from(suffixPostfixExpressions(canonical)).map((url) => [
    url,
    truncatedSha256Prefix(url, size),
  ]);
};

export const getPrefixes = (url, size = 32 * 8) => {
  const canonical = canonicalize(url);
  if (canonical === null) {
    return new Set();
  }
  return new Set(
    Array.from(suffixPostfixExpressions(canonical)).map((url) =>
      truncatedSha256Prefix(url, size)
    )
  );
};