Skip to content

Commit

Permalink
Clean up
Browse files Browse the repository at this point in the history
  • Loading branch information
jmduke committed Sep 22, 2024
1 parent c025fd8 commit dd122a3
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 156 deletions.
166 changes: 89 additions & 77 deletions lib/data.test.ts
Original file line number Diff line number Diff line change
@@ -1,90 +1,102 @@
import fetch from "@/lib/data";
import { describe, expect, test } from "bun:test";
import { DetectedTechnology } from "./parsers/types";
import type { DetectedTechnology } from "./parsers/types";

/**
 * Technologies that must NOT show up in the fetch result for a domain,
 * keyed by domain name.
 *
 * Each domain key must be listed exactly once: a repeated key in an object
 * literal is rejected by the TypeScript checker and, at runtime, the later
 * entry silently replaces the earlier one.
 */
const DOMAIN_TO_UNEXPECTED_DATA: Record<string, DetectedTechnology[]> = {
  "changelog.com": [
    {
      identifier: "subdomain",
      metadata: {
        value: "op3.dev",
      },
    },
  ],
};

/**
 * Technologies that must be present in the fetch result for a domain,
 * keyed by domain name.
 *
 * Each domain key is listed exactly once; repeating a key in the literal
 * would make the later entry overwrite the earlier one.
 */
const DOMAIN_TO_EXPECTED_DATA: Record<string, DetectedTechnology[]> = {
  "formkeep.com": [
    {
      identifier: "github",
      metadata: {
        username: "formkeep.js",
      },
    },
    {
      identifier: "linkedin",
      metadata: { username: "formkeep" },
    },
  ],
  "savvycal.com": [
    {
      identifier: "twitter",
      metadata: { username: "savvycal" },
    },
    {
      identifier: "rss",
      metadata: { url: "https://savvycal.com/feed.xml" },
    },
    {
      identifier: "rewardful",
      metadata: { value: "rewardful", via: "URL" },
    },
  ],
  "buttondown.email": [
    {
      identifier: "github",
      metadata: { username: "buttondown" },
    },
  ],
  "zed.dev": [
    {
      identifier: "twitter",
      metadata: { username: "zeddotdev" },
    },
  ],
  "bytereview.co.uk": [
    {
      identifier: "tiktok",
      metadata: { username: "@bytereview" },
    },
    {
      identifier: "twitter",
      metadata: { username: "bytereview" },
    },
  ],
  "lastwatchdog.com": [
    {
      identifier: "rss",
      metadata: {
        url: "https://www.lastwatchdog.com/feed/",
      },
    },
  ],
};

// Test suite for the default export of "@/lib/data" (imported here as `fetch`).
// Every case is registered exactly once; registering the same loop twice would
// create duplicate test names and double the number of fetches performed.
describe("fetching", () => {
  // One test per (domain, expected technology) pair: the fetched
  // `detected_technologies` list must contain the expected record.
  for (const [domain, expectedData] of Object.entries(
    DOMAIN_TO_EXPECTED_DATA,
  )) {
    for (const data of expectedData) {
      test(`fetches ${data.identifier} for ${domain}`, async () => {
        const { detected_technologies } = await fetch(domain);
        expect(detected_technologies).toContainEqual(data);
      });
    }
  }

  // One test per (domain, unexpected technology) pair: the fetched list must
  // not contain the given record.
  for (const [domain, unexpectedData] of Object.entries(
    DOMAIN_TO_UNEXPECTED_DATA,
  )) {
    for (const data of unexpectedData) {
      test(`does not fetch ${data.identifier} for ${domain}`, async () => {
        const { detected_technologies } = await fetch(domain);
        expect(detected_technologies).not.toContainEqual(data);
      });
    }
  }

  // The result for zed.dev must contain exactly one "twitter" record.
  test("deduping identical records", async () => {
    const { detected_technologies } = await fetch("zed.dev");
    const twitterRecords = detected_technologies.filter(
      (tech) => tech.identifier === "twitter",
    );
    expect(twitterRecords).toHaveLength(1);
  });
});
68 changes: 34 additions & 34 deletions lib/db/domains.ts
Original file line number Diff line number Diff line change
@@ -1,41 +1,41 @@
import fetch from "@/lib/data";
import type fetch from "@/lib/data";
import { db } from "@/lib/db/connection";

/**
 * Persists a fetch result for `domain`.
 *
 * Inserts one row into `domains` holding the whole result serialized as JSON,
 * then inserts one `detected_technologies` row for every detected technology
 * whose identifier is not already stored for that domain.
 *
 * @param domain - Domain the data belongs to.
 * @param data - Resolved result of the default export of `@/lib/data`.
 */
export const reify = async (
  domain: string,
  data: Awaited<ReturnType<typeof fetch>>,
): Promise<void> => {
  await db
    .insertInto("domains")
    .values({
      domain: domain,
      data: JSON.stringify(data),
    })
    .execute();

  // Identifiers already recorded for this domain; used to skip re-inserting them.
  const existingTechnologies = await db
    .selectFrom("detected_technologies")
    .select("technology")
    .where("domain", "=", domain)
    .execute();

  const existingTechSet = new Set(
    existingTechnologies.map((tech) => tech.technology),
  );

  const newTechnologies = data.detected_technologies
    .filter((technology) => !existingTechSet.has(technology.identifier))
    .map((technology) => ({
      domain: domain,
      technology: technology.identifier,
      data: JSON.stringify(technology.metadata),
      creation_date: new Date().toISOString(),
    }));

  // Only issue the insert when there is at least one new row to write.
  if (newTechnologies.length > 0) {
    await db
      .insertInto("detected_technologies")
      .values(newTechnologies)
      .execute();
  }
};
100 changes: 55 additions & 45 deletions lib/parsers/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,44 +93,49 @@ const JSONLD_RULE = (html: string) => {
const tag = parseHTML(html).querySelector(
"script[type='application/ld+json']",
);
if (tag) {
const text = tag.text;
const baseRule = [
{
identifier: "jsonld",
metadata: { value: text },
},
...((() => {
try {
return JSON.parse(text);
} catch (error) {
console.error("Error parsing JSON-LD:", error);
return {};
}
})()
["@graph"]?.filter((i: { sameAs: string[] }) => i.sameAs)
.flatMap((i: any) => {
return i.sameAs.flatMap((url: string) => {
const service = Object.values(REGISTRY).find((service) =>
url.includes(service.urlSubstrings?.[0] || ""),
);
if (!service) {
return [];
}
return [
{
identifier: service.identifier.split("?")[0],
metadata: {
username: url.split("/").pop(),
},
},
];
});
}) || []),
];
return baseRule;
if (!tag) {
return [];
}
return [];
const text = tag.text;
const baseRule = [
{
identifier: "jsonld",
metadata: { value: text },
},
];

try {
const parsedJson = JSON.parse(text);
const graph = Array.isArray(parsedJson) ? parsedJson : parsedJson["@graph"];

if (Array.isArray(graph)) {
const additionalRules = graph
.filter((item) => item && Array.isArray(item.sameAs))
.flatMap((item) =>
item.sameAs
.map((url: string) => {
const service = Object.values(REGISTRY).find((s) =>
url.includes(s.urlSubstrings?.[0] || ""),
);
if (service) {
return {
identifier: service.identifier.split("?")[0],
metadata: {
username: url.split("/").pop() || "",
},
};
}
return null;
})
.filter(Boolean),
);

baseRule.push(...additionalRules);
}
} catch (error) {
console.error("Error parsing or processing JSON-LD:", error);
}
return baseRule;
};

const RSS_RULE = (html: string): DetectedTechnology[] => {
Expand Down Expand Up @@ -159,20 +164,25 @@ const RSS_RULE = (html: string): DetectedTechnology[] => {
return [];
};

/**
 * Reports whether `potentialValue` is an absolute http(s) URL whose host is a
 * genuine subdomain of `domain`.
 *
 * A host qualifies only when it ends with `.${domain}`. This rejects the bare
 * domain itself, look-alike hosts that merely contain the domain as a
 * substring (e.g. `notexample.com`, `example.com.evil.io`), and the
 * conventional `www.` alias.
 *
 * @param potentialValue - Candidate URL, typically an anchor's `href`.
 * @param domain - Registered domain being inspected, e.g. `example.com`.
 * @returns `true` for a non-`www` subdomain URL; `false` otherwise, including
 *   for relative or unparsable values.
 */
const isValidSubdomain = (potentialValue: string, domain: string): boolean => {
  const root = domain.toLowerCase();
  if (root === "") {
    return false;
  }

  let url: URL;
  try {
    // Relative paths, fragments and other non-absolute values throw here.
    url = new URL(potentialValue);
  } catch {
    return false;
  }

  if (url.protocol !== "http:" && url.protocol !== "https:") {
    return false;
  }

  // `URL` already lower-cases the hostname; requiring the ".domain" suffix
  // excludes both the bare domain and unrelated hosts that contain it.
  const { hostname } = url;
  return hostname.endsWith(`.${root}`) && hostname !== `www.${root}`;
};

const SUBDOMAIN_RULE = (html: string, domain: string) => {
const subdomains = parseHTML(html)
.querySelectorAll("a")
.map((a) => ({
value: a.getAttribute("href"),
}))
.filter(
(v) =>
v.value &&
v.value.startsWith("http") &&
new URL(v.value).hostname.includes(domain) &&
new URL(v.value).hostname !== "www." + domain &&
new URL(v.value).hostname !== domain,
)
.filter((v) => isValidSubdomain(v.value || "", domain))
.map((v) => ({
value: new URL(v.value || "").hostname,
}))
Expand Down
Loading

0 comments on commit dd122a3

Please sign in to comment.