Skip to content

Commit

Permalink
update crawler sus output
Browse files Browse the repository at this point in the history
  • Loading branch information
tgxn committed Jan 3, 2025
1 parent 4519cfc commit 88fcc60
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 27 deletions.
7 changes: 6 additions & 1 deletion .vscode/extensions.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
{
"recommendations": ["ms-vscode.vscode-typescript-next"]
"recommendations": [
"ms-azuretools.vscode-docker",
"ms-vscode.vscode-typescript-next",
"ms-playwright.playwright",
"esbenp.prettier-vscode"
]
}
36 changes: 36 additions & 0 deletions crawler/src/lib/storage/fediseer.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,40 @@
import { CrawlStorage } from "../crawlStorage";

/**
* "visibility_endorsements": "OPEN",
"visibility_censures": "OPEN",
"visibility_hesitations": "OPEN",
"flags": [],
"id": 2337,
"domain": "soc.ottr.uk",
"software": "mastodon",
"claimed": 1,
"open_registrations": false,
"email_verify": null,
"approval_required": false,
"has_captcha": null,
"approvals": 8,
"endorsements": 1,
"guarantor": "ff.collins-corner.cc",
"censure_reasons": null,
"sysadmins": 1,
"moderators": 1,
"state": "UP",
"tags": [
"friends only",
"furry",
"nsfw allowed",
"hosted in eu",
"small instance"
]
},
*/

/**
 * A single tag entry attached to a Fediseer instance record.
 *
 * Used as the element type of the optional `tags` field on `IFediseerData`.
 * NOTE(review): the sample payload in the comment above lists tags as plain
 * strings — confirm this `{ tag, rank }` shape against the stored data.
 */
export type IFediseerTagData = {
// the tag's text — presumably a label such as "small instance"; verify
tag: string;
// numeric rank of the tag; its meaning/ordering is not visible here — TODO confirm
rank: number;
};

export type IFediseerData = {
id: number;
domain: string;
Expand All @@ -10,6 +45,7 @@ export type IFediseerData = {
approvals: number;
endorsements: number;
guarantor: string;
tags?: IFediseerTagData[];
};

export default class Fediseer {
Expand Down
39 changes: 36 additions & 3 deletions crawler/src/output/output.ts
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,22 @@ export default class CrawlOutput {
// delete existing data from the output directory
await this.fileWriter.cleanData();

const susSiteList = this.trust.getSusInstances();
let susSiteList = this.trust.getSusInstances();

// remove sus sites not updated in 24h
susSiteList = susSiteList.filter((instance) => {
if (!instance.lastCrawled) return false; // record needs time

// remove sus instances whose record is older than the max instance age
const recordAge = Date.now() - instance.lastCrawled;
if (recordAge > OUTPUT_MAX_AGE.INSTANCE) {
console.log("Sus Site too old", instance.base);
return false;
}

return true;
});

await this.fileWriter.storeSuspicousData(susSiteList);

const returnInstanceArray = await this.getInstanceArray();
Expand Down Expand Up @@ -627,11 +642,21 @@ export default class CrawlOutput {

const siteUptime = this.getBaseUrlUptime(siteBaseUrl);

if (!this.trust.blockedFederation) {
throw new Error("No blocked federation data");
}

// incoming blocks are fetched from the trust store
const incomingBlocks = this.trust.blockedFederation[siteBaseUrl] || 0;

// outgoing blocks come from siteData
const outgoingBlocks = instance.siteData.federated?.blocked?.length || 0;

// console.log("outgoingBlocks", outgoingBlocks);

const instanceTrustData = this.trust.getInstance(siteBaseUrl);

// console.log("instanceTrustData", instanceTrustData);
const score = instanceTrustData.score;

// ignore instances that have no data
Expand Down Expand Up @@ -759,7 +784,15 @@ export default class CrawlOutput {
// }

const relatedInstance = returnInstanceArray.find((instance) => instance.baseurl === siteBaseUrl);
const isInstanceSus: [] = relatedInstance?.trust || []; //await this.trust.getInstanceSusReasons(relatedInstance);
let isInstanceSus: boolean = false;
if (relatedInstance) {
isInstanceSus = relatedInstance.isSuspicious;
}

// if (siteBaseUrl.includes("zerobytes")) {
// console.log("siteBaseUrl", siteBaseUrl, isInstanceSus, relatedInstance);
// }
// const isInstanceSus: boolean = relatedInstance.isSuspicious || false; //await this.trust.getInstanceSusReasons(relatedInstance);

// // calculate community published time
// let publishTime = null;trust
Expand Down Expand Up @@ -794,7 +827,7 @@ export default class CrawlOutput {
published: OutputUtils.parseLemmyTimeToUnix(community.community?.published),
time: community.lastCrawled || 0,

isSuspicious: isInstanceSus.length > 0 ? true : false,
isSuspicious: isInstanceSus,
score: score,
};

Expand Down
101 changes: 78 additions & 23 deletions crawler/src/output/trust.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
import divinator from "divinator";

import storage from "../lib/crawlStorage";
import { IInstanceData, IInstanceDataKeyValue } from "../lib/storage/instance";
import { ICommunityData, ICommunityDataKeyValue } from "../lib/storage/community";
import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/kbin";
import { IFediverseData, IFediverseDataKeyValue } from "../lib/storage/fediverse";
import { IFediseerData } from "../lib/storage/fediseer";
import {
IErrorData,
IErrorDataKeyValue,
ILastCrawlData,
ILastCrawlDataKeyValue,
} from "../lib/storage/tracking";
import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime";

import { OUTPUT_MAX_AGE } from "../lib/const";

// used to calculate instance overall rating, as well as several instance and community metrics
// it is meant to take some of the trust assertion logic out of the main output script
Expand All @@ -26,14 +40,14 @@ import storage from "../lib/crawlStorage";

// create a new instance for the overall output, and call methods on it
export default class OutputTrust {
private instanceList;
private instanceList: IInstanceData[] | null;

public fediseerData;
public fediseerData: IFediseerData[] | null = null;
public endorsements;

public linkedFederation;
public allowedFederation;
public blockedFederation;
public linkedFederation: { [key: string]: number } | null = null;
public allowedFederation: { [key: string]: number } | null = null;
public blockedFederation: { [key: string]: number } | null = null;

public instancesWithMetrics;
public allInstanceMetrics;
Expand All @@ -45,7 +59,7 @@ export default class OutputTrust {
}

// loads the initial data into the trust library
async setupSources(instanceList) {
async setupSources(instanceList: IInstanceData[]) {
this.instanceList = instanceList;

this.fediseerData = await storage.fediseer.getLatest();
Expand All @@ -64,26 +78,40 @@ export default class OutputTrust {
await this.getInstancesWithMetrics();
}

getAllInstanceEndorsements() {
const endorsements = {};
getAllInstanceEndorsements(): { [key: string]: number } {
if (!this.fediseerData) {
throw new Error("fediseerData not loaded in getAllInstanceEndorsements");
}

const endorsements: { [key: string]: number } = {};

this.fediseerData.forEach((instance) => {
if (instance.endorsements) {
endorsements[instance.domain] = instance.endorsements;
}
});

return endorsements;
}

async getInstancesWithMetrics() {
if (!this.instanceList) {
throw new Error("instanceList not loaded in getInstancesWithMetrics");
}

this.instancesWithMetrics = await Promise.all(
this.instanceList.map(async (instance) => {
const baseUrl = instance.siteData.site.actor_id.split("/")[2];

const instanceMetrics = await this.calculateInstanceMetrics(instance);

if (!this.fediseerData) {
throw new Error("fediseerData not loaded in getInstancesWithMetrics");
}

const instanceGuarantor = this.fediseerData.find((instance) => instance.domain === baseUrl);

let guarantor = null;
let guarantor: string | null = null;
if (instanceGuarantor !== undefined && instanceGuarantor.guarantor !== null) {
console.log(baseUrl, "instanceGuarantor", instanceGuarantor.guarantor);
guarantor = instanceGuarantor.guarantor;
Expand All @@ -92,6 +120,8 @@ export default class OutputTrust {
return {
// ...instance,

lastCrawled: instance.lastCrawled,

baseurl: baseUrl,
metrics: instanceMetrics,

Expand Down Expand Up @@ -139,7 +169,11 @@ export default class OutputTrust {
);
}

getInstanceTags(instance) {
getInstanceTags(instance): string[] {
if (!this.fediseerData) {
throw new Error("fediseerData not loaded in getInstanceTags");
}

let tags: string[] = [];

const baseUrl = instance.siteData.site.actor_id.split("/")[2];
Expand Down Expand Up @@ -406,11 +440,11 @@ export default class OutputTrust {
// return null;
// }

getInstance(baseUrl) {
const instanceGuarantor = this.instancesWithMetrics.find((instance) => instance.baseurl === baseUrl);
getInstance(baseUrl: string) {
const instanceTrustDetails = this.instancesWithMetrics.find((instance) => instance.baseurl === baseUrl);

if (instanceGuarantor) {
return instanceGuarantor;
if (instanceTrustDetails) {
return instanceTrustDetails;
}

return null;
Expand All @@ -432,10 +466,24 @@ export default class OutputTrust {
actor_id: instance.actor_id,
metrics: instance.metrics,
reasons: instance.reasons,
lastCrawled: instance.lastCrawled,
});
}
}

// remove instances not updated in 24h
// this.instanceList = this.instanceList.filter((instance) => {
// if (!instance.lastCrawled) return false; // record needs time

// // remove communities with age more than the max
// const recordAge = Date.now() - instance.lastCrawled;
// if (recordAge > OUTPUT_MAX_AGE.INSTANCE) {
// return false;
// }

// return true;
// });

return susInstances;
}

Expand Down Expand Up @@ -494,13 +542,13 @@ export default class OutputTrust {

// run domain through this to get scores
// this generates the "smart sort" score that is used by default
calcInstanceScore(baseUrl) {
calcInstanceScore(baseUrl: string) {
let score = 0;

const scores: any = {};

// having a linked instance gives you a point for each link
if (this.linkedFederation[baseUrl]) {
if (this.linkedFederation && this.linkedFederation[baseUrl]) {
scores.linked = this.linkedFederation[baseUrl] / 2;
score += scores.linked;
}
Expand All @@ -512,13 +560,13 @@ export default class OutputTrust {
}

// each allowed instance gives you points
if (this.allowedFederation[baseUrl]) {
if (this.allowedFederation && this.allowedFederation[baseUrl]) {
scores.allowed = this.allowedFederation[baseUrl] * 3;
score += scores.allowed;
}

// each blocked instance takes away points
if (this.blockedFederation[baseUrl]) {
if (this.blockedFederation && this.blockedFederation[baseUrl]) {
scores.blocked = parseInt("-" + this.blockedFederation[baseUrl] * 10);
score += scores.blocked;
}
Expand Down Expand Up @@ -554,24 +602,31 @@ export default class OutputTrust {

const activeScore = community.counts.users_active_month * 20;

// console.log("instanceMetricsinstanceMetrics", instanceMetrics);

const score = instanceMetrics.score * activityScore * activeScore;

return score;
}

// given an array, build de-duplicated lists of all the baseurls; returns a tuple of three objects, each mapping a baseurl to its count
getFederationLists(instances) {
getFederationLists(
instances,
): [{ [key: string]: number }, { [key: string]: number }, { [key: string]: number }] {
// count instances by list
let linkedFederation = {};
let allowedFederation = {};
let blockedFederation = {};
let linkedFederation: { [key: string]: number } = {};
let allowedFederation: { [key: string]: number } = {};
let blockedFederation: { [key: string]: number } = {};

function dedupAddItem(list, baseUrl) {
function dedupAddItem(list: { [key: string]: number }, baseUrl: string) {
// only add strings
if (typeof baseUrl !== "string") {
return;
}

// trim
baseUrl = baseUrl.trim();

if (!list[baseUrl]) {
list[baseUrl] = 1;
} else {
Expand Down

0 comments on commit 88fcc60

Please sign in to comment.