From ce7a90905881050effe5c09fd532bcf84baeb1d5 Mon Sep 17 00:00:00 2001 From: tgxn Date: Mon, 6 Jan 2025 23:21:46 +0800 Subject: [PATCH 01/15] add mbin crawler basics --- crawler/src/bin/task.ts | 9 + crawler/src/crawl/mbin.ts | 554 ++++++++++++++++++++++++++++++++ crawler/src/lib/crawlStorage.ts | 3 + crawler/src/lib/storage/mbin.ts | 96 ++++++ crawler/src/queue/mbin.ts | 60 ++++ 5 files changed, 722 insertions(+) create mode 100644 crawler/src/crawl/mbin.ts create mode 100644 crawler/src/lib/storage/mbin.ts create mode 100644 crawler/src/queue/mbin.ts diff --git a/crawler/src/bin/task.ts b/crawler/src/bin/task.ts index 56ec818..62be9e4 100644 --- a/crawler/src/bin/task.ts +++ b/crawler/src/bin/task.ts @@ -13,6 +13,7 @@ import { syncCheckpoint } from "../output/sync_s3"; import CrawlUptime from "../crawl/uptime"; import CrawlFediseer from "../crawl/fediseer"; import CrawlKBin from "../crawl/kbin"; +import CrawlMBin from "../crawl/mbin"; import CrawlAged from "../util/aged"; import Failures from "../util/failures"; @@ -129,6 +130,14 @@ export default async function runTask(taskName: string) { break; + // create jobs for all known kbin instances + case "mbin": + const mbinScan = new CrawlMBin(); + // await mbinScan.createJobsAllKBin(); + await mbinScan.getInstances(); + + break; + // crawl the fediverse uptime immediately case "uptime": const uptime = new CrawlUptime(); diff --git a/crawler/src/crawl/mbin.ts b/crawler/src/crawl/mbin.ts new file mode 100644 index 0000000..eb8ebae --- /dev/null +++ b/crawler/src/crawl/mbin.ts @@ -0,0 +1,554 @@ +import path from "node:path"; +import util from "node:util"; +import { exec } from "node:child_process"; + +import logging from "../lib/logging"; + +import storage from "../lib/crawlStorage"; +import { IFediverseDataKeyValue } from "../lib/storage/fediverse"; + +import { CrawlError, CrawlTooRecentError } from "../lib/error"; + +import MBinQueue from "../queue/mbin"; +import InstanceQueue from "../queue/instance"; + +import CrawlClient from "../lib/CrawlClient"; + +import { CRAWL_AGED_TIME } from "../lib/const"; + +const TIME_BETWEEN_PAGES = 2000; + +const RETRY_COUNT = 2; +const RETRY_PAGE_COUNT = 2; +const TIME_BETWEEN_RETRIES = 1000; + +const PAGE_TIMEOUT = 5000; + +type IIncomingMagazineData = { + magazineId: number; + owner: { + magazineId: number; + userId: number; + avatar: any; + username: string; + apId: any; + }; + icon: any; + name: string; + title: string; + description: string; + rules: string; + subscriptionsCount: number; + entryCount: number; + entryCommentCount: number; + postCount: number; + postCommentCount: number; + isAdult: boolean; + isUserSubscribed: any; + isBlockedByUser: any; + tags: any; + badges: any[]; + moderators: { + magazineId: number; + userId: number; + avatar: any; + username: string; + apId: any; + }[]; + apId: any; + apProfileId: string; + serverSoftware: any; + serverSoftwareVersion: any; + isPostingRestrictedToMods: boolean; +}; + +export default class CrawlMBin { + private fediverseData: IFediverseDataKeyValue | null; + private logPrefix: string; + + private instanceQueue: InstanceQueue; + + private client: CrawlClient; + + constructor() { + this.fediverseData = null; + this.logPrefix = `[CrawlMBin]`; + + this.instanceQueue = new InstanceQueue(false); + + this.client = new CrawlClient(); + } + + async getInstances() { + this.fediverseData = await storage.fediverse.getAll(); + + const mbinFedServersDateFiltered = Object.entries(this.fediverseData) + .filter((fediServer) => { + return fediServer[1].name === "mbin"; 
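+        // ("name" here is the nodeinfo software name recorded by the fediverse crawl)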
+ }) + + .filter((fediServer) => { + if (!fediServer[1].time) return true; + + // remove all mbin instanced not crawled in the last 24 hours + return Date.now() - fediServer[1].time < CRAWL_AGED_TIME.FEDIVERSE; + }) + + .map((fediServer) => { + return { + base: fediServer[0].replace("fediverse:", ""), + ...fediServer[1], + }; + }); + + logging.info("mb", mbinFedServersDateFiltered.length); + + const instanceData = await this.crawlInstanceData("fedia.io"); + + const magazinesData = await this.getMagazinesData("fedia.io"); + + console.log("magazinesData", magazinesData.length); + console.log("magazinesData", magazinesData[0]); + + return mbinFedServersDateFiltered; + } + + // scan the full list of fediverse marked instances with "kbin" + async createJobsAllMBin() { + try { + // get all fedi kbin servers + const mbinServers = await this.getInstances(); + logging.info(`MBin Instances Total: ${mbinServers.length}`); + + const mbinQueue = new MBinQueue(false); + for (const mbinServer of mbinServers) { + this.logPrefix = `[CrawlMBin] [${mbinServer.base}]`; + console.log(`${this.logPrefix} create job ${mbinServer.base}`); + + await mbinQueue.createJob(mbinServer.base); + } + } catch (e) { + console.error(`${this.logPrefix} error scanning kbin instance`, e); + } + } + + async crawlInstanceData(crawlDomain: string) { + const nodeInfo = await this.getNodeInfo(crawlDomain); + + console.log(`${this.logPrefix} [${crawlDomain}] nodeInfo`, nodeInfo); + + if (!nodeInfo.software) { + throw new CrawlError("no software key found for " + crawlDomain); + } + + // store all fediverse instance software for easy metrics + await storage.fediverse.upsert(crawlDomain, nodeInfo.software); + + // only allow mbin instances + if (nodeInfo.software.name != "mbin") { + throw new CrawlError(`not a mbin instance (${nodeInfo.software.name})`); + } + + const [siteInfo, siteHeaders] = await this.getSiteInfo(crawlDomain); + console.log(`${crawlDomain}: found mbin instance`, siteHeaders, siteInfo); + // console.log(`${crawlDomain}: found mbin instance`, siteHeaders); + + // if (siteInfo.websiteDomain !== crawlDomain) { + // console.error(`${crawlDomain}: mismatched domain`, siteInfo.websiteDomain); + // throw new CrawlError(`${crawlDomain}: mismatched domain`, siteInfo.websiteDomain); + // } + + if (siteInfo.websiteDomain !== crawlDomain) { + console.error( + `${crawlDomain}: actor id does not match instance domain: ${siteInfo.websiteDomain} !== ${crawlDomain}`, + ); + throw new CrawlError( + `${crawlDomain}: actor id does not match instance domain: ${siteInfo.websiteDomain} !== ${crawlDomain}`, + ); + } + + return siteInfo; + } + + async getNodeInfo(crawlDomain: string) { + const wellKnownUrl = "https://" + crawlDomain + "/.well-known/nodeinfo"; + + console.log(`${this.logPrefix} [${crawlDomain}] wellKnownUrl`, wellKnownUrl); + + const wellKnownInfo = await this.client.getUrlWithRetry( + wellKnownUrl, + { + timeout: 10000, // smaller for nodeinfo + }, + 2, + ); + + let nodeinfoUrl: string | null = null; + if (!wellKnownInfo.data.links) { + throw new CrawlError("missing /.well-known/nodeinfo links"); + } + + for (var linkRel of wellKnownInfo.data.links) { + if ( + linkRel.rel == "http://nodeinfo.diaspora.software/ns/schema/2.0" || + linkRel.rel == "http://nodeinfo.diaspora.software/ns/schema/2.1" + ) { + nodeinfoUrl = linkRel.href; + } + } + if (!nodeinfoUrl) { + throw new CrawlError("no diaspora rel in /.well-known/nodeinfo"); + } + + const nodeNodeInfoData = await this.client.getUrlWithRetry(nodeinfoUrl); + return 
nodeNodeInfoData.data; + } + + async getSiteInfo(crawlDomain: string) { + const siteInfo = await this.client.getUrlWithRetry("https://" + crawlDomain + "/api/info"); + + return [siteInfo.data, siteInfo.headers]; + } + + async getMagazinesData(crawlDomain: string, pageNumber: number = 1): Promise { + const communities = await this.getPageData(crawlDomain, pageNumber); + + logging.debug(`${this.logPrefix} Page ${pageNumber}, Results: ${communities.length}`); + + // promises track the upsert of Magazine data + let magazinesData: IIncomingMagazineData[] = communities; + + // if this page had non-zero results + if (communities.length > 0) { + // upsert the page's magazine's data + let promises: Promise[] = []; + for (var magazineData of communities) { + promises.push(this.storeMagazineData(crawlDomain, magazineData)); + } + await Promise.all(promises); + + // sleep between pages + await new Promise((resolve) => setTimeout(resolve, TIME_BETWEEN_PAGES)); + + const subPromises = await this.getMagazinesData(crawlDomain, pageNumber + 1); + if (subPromises.length > 0) { + magazinesData.push(...subPromises); + } + } + + return magazinesData; + } + + async getPageData(crawlDomain: string, pageNumber: number = 1): Promise { + logging.debug(`${this.logPrefix} Page ${pageNumber}, Fetching...`); + + try { + let magazineList = await this.client.getUrlWithRetry( + "https://" + crawlDomain + "/api/magazines", + { + params: { + p: pageNumber, + perPage: 50, + federation: "local", + hide_adult: "show", + }, + timeout: PAGE_TIMEOUT, + }, + RETRY_PAGE_COUNT, // retry count per-page + ); + + const magazines = magazineList.data.items; + + // must be an array + if (!Array.isArray(magazines)) { + logging.error(`${this.logPrefix} Community list not an array:`, magazineList.data.substr(0, 15)); + throw new CrawlError(`Community list not an array: ${magazines}`); + } + + return magazines; + } catch (e) { + // mbin will return a 404 at the end of results + if (e.response.status === 404 && pageNumber > 1) { + return []; + } + + // throw new CrawlError("Failed to get community page"); + throw new CrawlError(e.message, e); + } + } + + // validate the community is for the domain being scanned, and save it + async storeMagazineData(crawlDomain: string, magazineData: IIncomingMagazineData) { + // const { basePart, communityPart } = this.splitCommunityActorParts(community.community.actor_id); + + // console.log(`${this.logPrefix} [${magazineData.name}]`); + + // validate the community actor_id matches the domain + // if (basePart != this.crawlDomain || communityPart != community.community.name) { + // logging.error( + // `${this.logPrefix} Community actor_id does not match domain: ${community.community.actor_id} ${community.community.name}`, + // ); + // return false; + // } + + await storage.mbin.upsert(crawlDomain, magazineData); + + // await storage.community.setTrackedAttribute( + // this.crawlDomain, + // communityPart, + // "subscribers", + // community.counts.subscribers, + // ); + + // if (community.counts.hot_rank) { + // await storage.community.setTrackedAttribute( + // this.crawlDomain, + // communityPart, + // "hot_rank", + // community.counts.hot_rank, + // ); + // } + + // if (community.counts.posts) { + // await storage.community.setTrackedAttribute( + // this.crawlDomain, + // communityPart, + // "posts", + // community.counts.posts, + // ); + // } + + // if (community.counts.comments) { + // await storage.community.setTrackedAttribute( + // this.crawlDomain, + // communityPart, + // "comments", + // 
community.counts.comments, + // ); + // } + + // if (community.counts.users_active_day) { + // await storage.community.setTrackedAttribute( + // this.crawlDomain, + // communityPart, + // "users_active_day", + // community.counts.users_active_day, + // ); + // } + + // if (community.counts.users_active_week) { + // await storage.community.setTrackedAttribute( + // this.crawlDomain, + // communityPart, + // "users_active_week", + // community.counts.users_active_week, + // ); + // } + + // if (community.counts.users_active_month) { + // await storage.community.setTrackedAttribute( + // this.crawlDomain, + // communityPart, + // "users_active_month", + // community.counts.users_active_month, + // ); + // } + + return true; + } + + /** + * - `/api/federated` to get list of federated instances + * - `/api/info` to get instance info + * - `/api/magazines` to get list of magazines + * - `/api/magazine/{magazine_id}` to get magazine info + */ + + // // scan the full list of fediverse marked instances with "kbin" + // async createJobsAllKBin() { + // try { + // // get all fedi kbin servers + // const kbinServers = await this.getKBin(); + // logging.info(`KBin Instances Total: ${kbinServers.length}`); + + // const kbinQueue = new KBinQueue(false); + // for (const kbinServer of kbinServers) { + // this.logPrefix = `[CrawlKBin] [${kbinServer.base}]`; + // console.log(`${this.logPrefix} create job ${kbinServer.base}`); + + // await kbinQueue.createJob(kbinServer.base); + // } + // } catch (e) { + // console.error(`${this.logPrefix} error scanning kbin instance`, e); + // } + // } + + // // scan a single kbin instance's magazines + // async processOneInstance(kbinBaseUrl) { + // let sketchyList = await this.getSketch(kbinBaseUrl); + // sketchyList = sketchyList.filter((mag) => mag != ""); + // // fix spaces + // sketchyList = sketchyList.map((mag) => { + // if (mag.indexOf(" ") !== -1) { + // return mag.split(" ")[0]; + // } + // return mag; + // }); + + // const localMagazines = sketchyList.filter((mag) => { + // if (mag.indexOf("@") !== -1) { + // return false; + // } + // return true; + // }); + + // const nonLocalMagazines = sketchyList.filter((mag) => { + // if (mag.indexOf("@") !== -1) { + // return true; + // } + // return false; + // }); + + // console.log( + // `${this.logPrefix} [${kbinBaseUrl}] local: ${localMagazines.length} others: ${nonLocalMagazines.length} `, + // ); + + // if (localMagazines.length > 0) { + // for (const mag of localMagazines) { + // try { + // // check for recent scan of this magazine + // const lastCrawl = await storage.tracking.getLastCrawl("magazine", `${kbinBaseUrl}:${mag}`); + // if (lastCrawl) { + // const lastCrawledMsAgo = Date.now() - lastCrawl.time; + // throw new CrawlTooRecentError( + // `Skipping - Crawled too recently (${lastCrawledMsAgo / 1000}s ago)`, + // ); + // } + + // await this.getStoreMag(kbinBaseUrl, mag); + // } catch (e) { + // console.error(`${this.logPrefix} error scanning kbin MAG`, kbinBaseUrl, mag, e.message); + // } + // // await new Promise((resolve) => setTimeout(resolve, 1000)); + // } + // } + + // // create kbin job to scan non-local baseurls + // if (nonLocalMagazines.length > 0) { + // // const kbinQueue = new KBinQueue(false); + // for (const otherName of nonLocalMagazines) { + // // console.log(`${this.logPrefix} otherName`, otherName); + + // const split = otherName.split("@"); + // // console.log(`${this.logPrefix} split`, split); + + // if (split.length === 2) { + // // must have two parts, we only want the second bit after 
the @ + // // add to the instance queue to validate it is a kbin instance + // await this.instanceQueue.createJob(split[1]); + // } + // } + // } + + // return; + // } + + // async getStoreMag(kbinBaseUrl: string, mag) { + // const magazineInfo = await this.getMagazineInfo(kbinBaseUrl, mag); + + // if (magazineInfo.type === "Group") { + // const followerData = await this.getFollowupData(magazineInfo.followers); + // const followers = followerData.totalItems; + + // console.log(`got followers`, followers); + + // // save group + // const saveGroup = { + // baseurl: kbinBaseUrl, + // followerCount: followers, + // title: magazineInfo.name, + + // // name must overide the name from the api + // ...magazineInfo, + // name: mag, + // }; + // await storage.kbin.upsert(kbinBaseUrl, saveGroup); + // await storage.tracking.setLastCrawl("magazine", `${kbinBaseUrl}:${mag}`, { + // followers, + // }); + + // logging.info(`${this.logPrefix} mag: ${mag} Saved KBin Magazine`); + // } else { + // console.log(`${this.logPrefix} mag: ${mag} is not a group`, magazineInfo); + // } + + // return; + // } + + // // this calls the current method from here https://github.com/tgxn/lemmy-explorer/issues/100#issuecomment-1617444934 + // async getSketch(baseUrl) { + // var currentPath = process.cwd(); + // const printHelloCommand = `/bin/bash ${path.join(currentPath, "src", "crawl", "sketch.sh")} ${baseUrl}`; + // const results = await execAsync(printHelloCommand); + // // console.log(results.stdout); + + // const mappedArray = results.stdout.split("\n"); + + // if (!Array.isArray(mappedArray)) { + // throw new CrawlError(`failed to get sketch (${baseUrl}): ${results.stdout}`); + // } + + // return mappedArray; + // } + + // // uses non-documented api on instances to get a json list of all kbin magazine data + // async getMagazineInfo(baseUrl, magazineName) { + // console.log(`${this.logPrefix} getMagazineInfo`, "https://" + baseUrl + "/m/" + magazineName); + // const magazineInfo = await this.client.getUrlWithRetry( + // "https://" + baseUrl + "/m/" + magazineName, + // { + // headers: { + // "Content-Type": "application/ld+json", + // Accept: "application/ld+json", + // }, + // }, + // 1, + // ); + + // return magazineInfo.data; + // } + + // async getFollowupData(wellKnownUrl) { + // const wellKnownInfo = await this.client.getUrlWithRetry( + // wellKnownUrl, + // { + // headers: { + // "Content-Type": "application/ld+json", + // Accept: "application/ld+json", + // }, + // }, + // 3, + // ); + // return wellKnownInfo.data; + // } + + // // get list of all known kbin servers + // async getKBin() { + // logging.info("Fetching KBin Instances"); + + // this.fediverseData = await storage.fediverse.getAll(); + + // const kbinFedServers = Object.entries(this.fediverseData) + // .filter((fediServer) => { + // return fediServer[1].name === "kbin"; + // }) + // .map((fediServer) => { + // return { + // base: fediServer[0].replace("fediverse:", ""), + // ...fediServer[1], + // }; + // }); + + // return kbinFedServers; + // } +} diff --git a/crawler/src/lib/crawlStorage.ts b/crawler/src/lib/crawlStorage.ts index 268a72a..f4f822a 100644 --- a/crawler/src/lib/crawlStorage.ts +++ b/crawler/src/lib/crawlStorage.ts @@ -7,6 +7,7 @@ import logging from "./logging"; import InstanceStore from "./storage/instance"; import CommunityStore from "./storage/community"; import KBinStore from "./storage/kbin"; +import MBinStore from "./storage/mbin"; // supporting import FediverseStore from "./storage/fediverse"; @@ -29,6 +30,7 @@ export class 
CrawlStorage { public fediseer: FediseerStore; public tracking: TrackingStore; public kbin: KBinStore; + public mbin: MBinStore; constructor() { logging.debug("CrawlStorage Constructed", REDIS_URL); @@ -45,6 +47,7 @@ export class CrawlStorage { this.fediseer = new FediseerStore(this); this.tracking = new TrackingStore(this); this.kbin = new KBinStore(this); + this.mbin = new MBinStore(this); } async connect() { diff --git a/crawler/src/lib/storage/mbin.ts b/crawler/src/lib/storage/mbin.ts new file mode 100644 index 0000000..7d27ded --- /dev/null +++ b/crawler/src/lib/storage/mbin.ts @@ -0,0 +1,96 @@ +import { CrawlStorage } from "../crawlStorage"; + +export type IMagazineData = { + magazineId: number; + owner: { + magazineId: number; + userId: number; + avatar: any; + username: string; + apId: any; + }; + icon: any; + name: string; + title: string; + description: string; + rules: string; + subscriptionsCount: number; + entryCount: number; + entryCommentCount: number; + postCount: number; + postCommentCount: number; + isAdult: boolean; + isUserSubscribed: any; + isBlockedByUser: any; + tags: any; + badges: any[]; + moderators: { + magazineId: number; + userId: number; + avatar: any; + username: string; + apId: any; + }[]; + apId: any; + apProfileId: string; + serverSoftware: any; + serverSoftwareVersion: any; + isPostingRestrictedToMods: boolean; + lastCrawled?: number; +}; +export type IMagazineDataKeyValue = { + [key: string]: IMagazineData; +}; + +export default class MBinStore { + private storage: CrawlStorage; + + constructor(storage: CrawlStorage) { + this.storage = storage; + } + + async getAll(): Promise { + return this.storage.listRedis(`mbin_magazine:*`); + } + + async getAllWithKeys(): Promise { + return this.storage.listRedisWithKeys(`mbin_magazine:*`); + } + + async getOne(baseUrl: string, magazineName: string) { + return this.storage.getRedis(`mbin_magazine:${baseUrl}:${magazineName}`); + } + + async upsert(baseUrl: string, magazine: IMagazineData) { + const storeData = { + ...magazine, + lastCrawled: Date.now(), + }; + return this.storage.putRedis(`mbin_magazine:${baseUrl}:${magazine.name.toLowerCase()}`, storeData); + } + + async delete(baseUrl: string, magazineName: string, reason = "unknown") { + const oldRecord = await this.getOne(baseUrl, magazineName); + await this.storage.putRedis(`deleted:mbin_magazine:${baseUrl}:${magazineName}`, { + ...oldRecord, + deletedAt: Date.now(), + deleteReason: reason, + }); + + return this.storage.deleteRedis(`mbin_magazine:${baseUrl}:${magazineName}`); + } + + // use these to track magazine attributes over time + async setTrackedAttribute( + baseUrl: string, + magazineName: string, + attributeName: string, + attributeValue: string, + ) { + return await this.storage.redisZAdd( + `attributes:mbin_magazine:${baseUrl}:${magazineName}:${attributeName}`, + Date.now(), + attributeValue, + ); + } +} diff --git a/crawler/src/queue/mbin.ts b/crawler/src/queue/mbin.ts new file mode 100644 index 0000000..274a671 --- /dev/null +++ b/crawler/src/queue/mbin.ts @@ -0,0 +1,60 @@ +import logging from "../lib/logging"; +import storage from "../lib/crawlStorage"; + +import { CrawlTooRecentError } from "../lib/error"; + +import BaseQueue, { IJobProcessor, ISuccessCallback } from "./BaseQueue"; + +import MBinCrawler from "../crawl/mbin"; + +export default class MBinQueue extends BaseQueue { + constructor(isWorker = false, queueName = "kbin") { + const processor: IJobProcessor = async ({ baseUrl }) => { + const startTime = Date.now(); + + try { + // check for 
recent scan of this KBIN instance + const lastCrawl = await storage.tracking.getLastCrawl("mbin", baseUrl); + if (lastCrawl) { + const lastCrawledMsAgo = Date.now() - lastCrawl.time; + throw new CrawlTooRecentError(`Skipping - Crawled too recently (${lastCrawledMsAgo / 1000}s ago)`); + } + + const crawler = new MBinCrawler(); + const mbinInstance = await crawler.processOneInstance(baseUrl); + + await storage.tracking.setLastCrawl("mbin", baseUrl, { + duration: (Date.now() - startTime) / 1000, + }); + + return mbinInstance; + } catch (error) { + if (error instanceof CrawlTooRecentError) { + logging.warn(`[MBinQueue] [${baseUrl}] CrawlTooRecentError: ${error.message}`); + return true; + } + + const errorDetail = { + error: error.message, + // stack: error.stack, + isAxiosError: error.isAxiosError, + requestUrl: error.isAxiosError ? error.request.url : null, + time: Date.now(), + }; + + await storage.tracking.upsertError("mbin", baseUrl, errorDetail); + + logging.error(`[MBinQueue] [${baseUrl}] Error: ${error.message}`, error); + } + + return false; + }; + + super(isWorker, queueName, processor); + } + + // use as KBinQueue.createJob({ baseUrl: "https://kbin.io" }); + async createJob(baseUrl: string, onSuccess: ISuccessCallback = null) { + await super.createJob(baseUrl, { baseUrl }, onSuccess); + } +} From 5e85c877328d29de4b8019a50ba981b309a3cc78 Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 00:04:07 +0800 Subject: [PATCH 02/15] mbin loaders --- crawler/src/bin/manual.ts | 11 + crawler/src/bin/task.ts | 5 +- crawler/src/bin/worker.ts | 18 ++ crawler/src/crawl/mbin.ts | 425 ++++++++++---------------------- crawler/src/lib/const.ts | 1 + crawler/src/lib/crawlStorage.ts | 2 +- crawler/src/lib/storage/mbin.ts | 2 +- crawler/src/queue/mbin.ts | 49 +--- 8 files changed, 171 insertions(+), 342 deletions(-) diff --git a/crawler/src/bin/manual.ts b/crawler/src/bin/manual.ts index 41c0b0f..4c54b8a 100644 --- a/crawler/src/bin/manual.ts +++ b/crawler/src/bin/manual.ts @@ -4,6 +4,7 @@ import InstanceQueue from "../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; import KBinQueue from "../queue/kbin"; +import MBinQueue from "../queue/mbin"; export default async function runManualWorker(workerName: string, firstParam: string, secondParam: string) { // scan one instance @@ -45,4 +46,14 @@ export default async function runManualWorker(workerName: string, firstParam: st process.exit(0); }); } + + // scan one mbin + else if (workerName == "m" || workerName == "mbin") { + logging.info(`Running Single MBin Crawl for ${firstParam}`); + const crawlMBinManual = new MBinQueue(true, "mbin_manual"); + await crawlMBinManual.createJob(firstParam, (resultData) => { + logging.info("MBIN Crawl Complete"); + process.exit(0); + }); + } } diff --git a/crawler/src/bin/task.ts b/crawler/src/bin/task.ts index 62be9e4..4ed835d 100644 --- a/crawler/src/bin/task.ts +++ b/crawler/src/bin/task.ts @@ -130,11 +130,10 @@ export default async function runTask(taskName: string) { break; - // create jobs for all known kbin instances + // create jobs for all known mbin instances case "mbin": const mbinScan = new CrawlMBin(); - // await mbinScan.createJobsAllKBin(); - await mbinScan.getInstances(); + await mbinScan.createJobsAllMBin(); break; diff --git a/crawler/src/bin/worker.ts b/crawler/src/bin/worker.ts index 940def0..8ff1016 100644 --- a/crawler/src/bin/worker.ts +++ b/crawler/src/bin/worker.ts @@ -8,11 +8,14 @@ import InstanceQueue from 
"../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; import KBinQueue from "../queue/kbin"; +import MBinQueue from "../queue/mbin"; +// used to create scheduled instance checks import CrawlAged from "../util/aged"; import CrawlFediseer from "../crawl/fediseer"; import CrawlUptime from "../crawl/uptime"; import CrawlKBin from "../crawl/kbin"; +import CrawlMBin from "../crawl/mbin"; import { syncCheckpoint } from "../output/sync_s3"; @@ -38,6 +41,9 @@ export default async function startWorker(startWorkerName: string) { } else if (startWorkerName == "kbin") { logging.info("Starting KBinQueue Processor"); new KBinQueue(true); + } else if (startWorkerName == "mbin") { + logging.info("Starting MBinQueue Processor"); + new MBinQueue(true); } // cron worker @@ -82,6 +88,18 @@ export default async function startWorker(startWorkerName: string) { await storage.close(); }); + // shares CRON_SCHEDULES.KBIN + logging.info("Creating KBin Cron Task", CRON_SCHEDULES.KBIN); + cron.schedule(CRON_SCHEDULES.MBIN, async (time) => { + console.log("Running MBin Cron Task", time); + await storage.connect(); + + const kbinScan = new CrawlMBin(); + await kbinScan.createJobsAllMBin(); + + await storage.close(); + }); + logging.info("Creating Uptime Cron Task", CRON_SCHEDULES.UPTIME); cron.schedule(CRON_SCHEDULES.UPTIME, async (time) => { console.log("Running Uptime Cron Task", time); diff --git a/crawler/src/crawl/mbin.ts b/crawler/src/crawl/mbin.ts index eb8ebae..a810ed9 100644 --- a/crawler/src/crawl/mbin.ts +++ b/crawler/src/crawl/mbin.ts @@ -9,8 +9,9 @@ import { IFediverseDataKeyValue } from "../lib/storage/fediverse"; import { CrawlError, CrawlTooRecentError } from "../lib/error"; +import { IJobProcessor } from "../queue/BaseQueue"; + import MBinQueue from "../queue/mbin"; -import InstanceQueue from "../queue/instance"; import CrawlClient from "../lib/CrawlClient"; @@ -66,7 +67,7 @@ export default class CrawlMBin { private fediverseData: IFediverseDataKeyValue | null; private logPrefix: string; - private instanceQueue: InstanceQueue; + private mbinQueue: MBinQueue; private client: CrawlClient; @@ -74,11 +75,37 @@ export default class CrawlMBin { this.fediverseData = null; this.logPrefix = `[CrawlMBin]`; - this.instanceQueue = new InstanceQueue(false); + this.mbinQueue = new MBinQueue(false); this.client = new CrawlClient(); } + /** + * - `/api/federated` to get list of federated instances + * - `/api/info` to get instance info + * - `/api/magazines` to get list of magazines + * - `/api/magazine/{magazine_id}` to get magazine info + */ + + // scan the full list of fediverse marked instances with "kbin" + async createJobsAllMBin() { + try { + // get all fedi kbin servers + const mbinServers = await this.getInstances(); + logging.info(`MBin Instances Total: ${mbinServers.length}`); + + const mbinQueue = new MBinQueue(false); + for (const mbinServer of mbinServers) { + this.logPrefix = `[CrawlMBin] [${mbinServer.base}]`; + console.log(`${this.logPrefix} create job ${mbinServer.base}`); + + await mbinQueue.createJob(mbinServer.base); + } + } catch (e) { + console.error(`${this.logPrefix} error scanning kbin instance`, e); + } + } + async getInstances() { this.fediverseData = await storage.fediverse.getAll(); @@ -103,35 +130,9 @@ export default class CrawlMBin { logging.info("mb", mbinFedServersDateFiltered.length); - const instanceData = await this.crawlInstanceData("fedia.io"); - - const magazinesData = await 
this.getMagazinesData("fedia.io"); - - console.log("magazinesData", magazinesData.length); - console.log("magazinesData", magazinesData[0]); - return mbinFedServersDateFiltered; } - // scan the full list of fediverse marked instances with "kbin" - async createJobsAllMBin() { - try { - // get all fedi kbin servers - const mbinServers = await this.getInstances(); - logging.info(`MBin Instances Total: ${mbinServers.length}`); - - const mbinQueue = new MBinQueue(false); - for (const mbinServer of mbinServers) { - this.logPrefix = `[CrawlMBin] [${mbinServer.base}]`; - console.log(`${this.logPrefix} create job ${mbinServer.base}`); - - await mbinQueue.createJob(mbinServer.base); - } - } catch (e) { - console.error(`${this.logPrefix} error scanning kbin instance`, e); - } - } - async crawlInstanceData(crawlDomain: string) { const nodeInfo = await this.getNodeInfo(crawlDomain); @@ -210,7 +211,34 @@ export default class CrawlMBin { return [siteInfo.data, siteInfo.headers]; } - async getMagazinesData(crawlDomain: string, pageNumber: number = 1): Promise { + async crawlFederatedInstances(crawlDomain: string) { + const fedReq = await this.client.getUrlWithRetry("https://" + crawlDomain + "/api/federated"); + + const federatedInstances = fedReq.data.instances; + + console.log(`${this.logPrefix} [${crawlDomain}] federatedInstances`, federatedInstances.length); + + for (var instance of federatedInstances) { + // if it has a software and domain, we put it in fediverse table + if (instance.domain && instance.software) { + await storage.fediverse.upsert(instance.domain, { + name: instance.software, + version: instance.version, + time: Date.now(), + }); + // console.log(`${this.logPrefix} [${crawlDomain}] upserted ${instance.software}:${instance.domain}`); + } + + if (instance.software === "mbin") { + console.log(`${this.logPrefix} [${crawlDomain}] create job ${instance.domain}`); + this.mbinQueue.createJob(instance.domain); + } + } + + return federatedInstances; + } + + async crawlMagazinesData(crawlDomain: string, pageNumber: number = 1): Promise { const communities = await this.getPageData(crawlDomain, pageNumber); logging.debug(`${this.logPrefix} Page ${pageNumber}, Results: ${communities.length}`); @@ -230,7 +258,7 @@ export default class CrawlMBin { // sleep between pages await new Promise((resolve) => setTimeout(resolve, TIME_BETWEEN_PAGES)); - const subPromises = await this.getMagazinesData(crawlDomain, pageNumber + 1); + const subPromises = await this.crawlMagazinesData(crawlDomain, pageNumber + 1); if (subPromises.length > 0) { magazinesData.push(...subPromises); } @@ -279,276 +307,89 @@ export default class CrawlMBin { // validate the community is for the domain being scanned, and save it async storeMagazineData(crawlDomain: string, magazineData: IIncomingMagazineData) { - // const { basePart, communityPart } = this.splitCommunityActorParts(community.community.actor_id); + await storage.mbin.upsert(crawlDomain, magazineData); - // console.log(`${this.logPrefix} [${magazineData.name}]`); + await storage.mbin.setTrackedAttribute( + crawlDomain, + magazineData.name, + "subscriptionsCount", + magazineData.subscriptionsCount, + ); - // validate the community actor_id matches the domain - // if (basePart != this.crawlDomain || communityPart != community.community.name) { - // logging.error( - // `${this.logPrefix} Community actor_id does not match domain: ${community.community.actor_id} ${community.community.name}`, - // ); - // return false; - // } + await storage.mbin.setTrackedAttribute( + 
crawlDomain, + magazineData.name, + "postCount", + magazineData.postCount, + ); - await storage.mbin.upsert(crawlDomain, magazineData); + await storage.mbin.setTrackedAttribute( + crawlDomain, + magazineData.name, + "postCommentCount", + magazineData.postCommentCount, + ); - // await storage.community.setTrackedAttribute( - // this.crawlDomain, - // communityPart, - // "subscribers", - // community.counts.subscribers, - // ); - - // if (community.counts.hot_rank) { - // await storage.community.setTrackedAttribute( - // this.crawlDomain, - // communityPart, - // "hot_rank", - // community.counts.hot_rank, - // ); - // } + return true; + } +} - // if (community.counts.posts) { - // await storage.community.setTrackedAttribute( - // this.crawlDomain, - // communityPart, - // "posts", - // community.counts.posts, - // ); - // } +export const mbinInstanceProcessor: IJobProcessor = async ({ baseUrl }) => { + const startTime = Date.now(); - // if (community.counts.comments) { - // await storage.community.setTrackedAttribute( - // this.crawlDomain, - // communityPart, - // "comments", - // community.counts.comments, - // ); - // } + try { + // check for recent scan of this KBIN instance + const lastCrawl = await storage.tracking.getLastCrawl("mbin", baseUrl); + if (lastCrawl) { + const lastCrawledMsAgo = Date.now() - lastCrawl.time; + throw new CrawlTooRecentError(`Skipping - Crawled too recently (${lastCrawledMsAgo / 1000}s ago)`); + } - // if (community.counts.users_active_day) { - // await storage.community.setTrackedAttribute( - // this.crawlDomain, - // communityPart, - // "users_active_day", - // community.counts.users_active_day, - // ); - // } + // check for recent error + const lastError = await storage.tracking.getOneError("mbin", baseUrl); + if (lastError?.time) { + const lastErrorTime = lastError.time; + const now = Date.now(); - // if (community.counts.users_active_week) { - // await storage.community.setTrackedAttribute( - // this.crawlDomain, - // communityPart, - // "users_active_week", - // community.counts.users_active_week, - // ); - // } + throw new CrawlTooRecentError(`Skipping - Error too recently (${(now - lastErrorTime) / 1000}s ago)`); + } - // if (community.counts.users_active_month) { - // await storage.community.setTrackedAttribute( - // this.crawlDomain, - // communityPart, - // "users_active_month", - // community.counts.users_active_month, - // ); - // } + const crawler = new CrawlMBin(); - return true; - } + const instanceData = await crawler.crawlInstanceData(baseUrl); - /** - * - `/api/federated` to get list of federated instances - * - `/api/info` to get instance info - * - `/api/magazines` to get list of magazines - * - `/api/magazine/{magazine_id}` to get magazine info - */ + await crawler.crawlFederatedInstances(baseUrl); - // // scan the full list of fediverse marked instances with "kbin" - // async createJobsAllKBin() { - // try { - // // get all fedi kbin servers - // const kbinServers = await this.getKBin(); - // logging.info(`KBin Instances Total: ${kbinServers.length}`); - - // const kbinQueue = new KBinQueue(false); - // for (const kbinServer of kbinServers) { - // this.logPrefix = `[CrawlKBin] [${kbinServer.base}]`; - // console.log(`${this.logPrefix} create job ${kbinServer.base}`); - - // await kbinQueue.createJob(kbinServer.base); - // } - // } catch (e) { - // console.error(`${this.logPrefix} error scanning kbin instance`, e); - // } - // } - - // // scan a single kbin instance's magazines - // async processOneInstance(kbinBaseUrl) { - // let 
sketchyList = await this.getSketch(kbinBaseUrl); - // sketchyList = sketchyList.filter((mag) => mag != ""); - // // fix spaces - // sketchyList = sketchyList.map((mag) => { - // if (mag.indexOf(" ") !== -1) { - // return mag.split(" ")[0]; - // } - // return mag; - // }); - - // const localMagazines = sketchyList.filter((mag) => { - // if (mag.indexOf("@") !== -1) { - // return false; - // } - // return true; - // }); - - // const nonLocalMagazines = sketchyList.filter((mag) => { - // if (mag.indexOf("@") !== -1) { - // return true; - // } - // return false; - // }); - - // console.log( - // `${this.logPrefix} [${kbinBaseUrl}] local: ${localMagazines.length} others: ${nonLocalMagazines.length} `, - // ); - - // if (localMagazines.length > 0) { - // for (const mag of localMagazines) { - // try { - // // check for recent scan of this magazine - // const lastCrawl = await storage.tracking.getLastCrawl("magazine", `${kbinBaseUrl}:${mag}`); - // if (lastCrawl) { - // const lastCrawledMsAgo = Date.now() - lastCrawl.time; - // throw new CrawlTooRecentError( - // `Skipping - Crawled too recently (${lastCrawledMsAgo / 1000}s ago)`, - // ); - // } - - // await this.getStoreMag(kbinBaseUrl, mag); - // } catch (e) { - // console.error(`${this.logPrefix} error scanning kbin MAG`, kbinBaseUrl, mag, e.message); - // } - // // await new Promise((resolve) => setTimeout(resolve, 1000)); - // } - // } - - // // create kbin job to scan non-local baseurls - // if (nonLocalMagazines.length > 0) { - // // const kbinQueue = new KBinQueue(false); - // for (const otherName of nonLocalMagazines) { - // // console.log(`${this.logPrefix} otherName`, otherName); - - // const split = otherName.split("@"); - // // console.log(`${this.logPrefix} split`, split); - - // if (split.length === 2) { - // // must have two parts, we only want the second bit after the @ - // // add to the instance queue to validate it is a kbin instance - // await this.instanceQueue.createJob(split[1]); - // } - // } - // } - - // return; - // } - - // async getStoreMag(kbinBaseUrl: string, mag) { - // const magazineInfo = await this.getMagazineInfo(kbinBaseUrl, mag); - - // if (magazineInfo.type === "Group") { - // const followerData = await this.getFollowupData(magazineInfo.followers); - // const followers = followerData.totalItems; - - // console.log(`got followers`, followers); - - // // save group - // const saveGroup = { - // baseurl: kbinBaseUrl, - // followerCount: followers, - // title: magazineInfo.name, - - // // name must overide the name from the api - // ...magazineInfo, - // name: mag, - // }; - // await storage.kbin.upsert(kbinBaseUrl, saveGroup); - // await storage.tracking.setLastCrawl("magazine", `${kbinBaseUrl}:${mag}`, { - // followers, - // }); - - // logging.info(`${this.logPrefix} mag: ${mag} Saved KBin Magazine`); - // } else { - // console.log(`${this.logPrefix} mag: ${mag} is not a group`, magazineInfo); - // } - - // return; - // } - - // // this calls the current method from here https://github.com/tgxn/lemmy-explorer/issues/100#issuecomment-1617444934 - // async getSketch(baseUrl) { - // var currentPath = process.cwd(); - // const printHelloCommand = `/bin/bash ${path.join(currentPath, "src", "crawl", "sketch.sh")} ${baseUrl}`; - // const results = await execAsync(printHelloCommand); - // // console.log(results.stdout); - - // const mappedArray = results.stdout.split("\n"); - - // if (!Array.isArray(mappedArray)) { - // throw new CrawlError(`failed to get sketch (${baseUrl}): ${results.stdout}`); - // } - - // return 
mappedArray; - // } - - // // uses non-documented api on instances to get a json list of all kbin magazine data - // async getMagazineInfo(baseUrl, magazineName) { - // console.log(`${this.logPrefix} getMagazineInfo`, "https://" + baseUrl + "/m/" + magazineName); - // const magazineInfo = await this.client.getUrlWithRetry( - // "https://" + baseUrl + "/m/" + magazineName, - // { - // headers: { - // "Content-Type": "application/ld+json", - // Accept: "application/ld+json", - // }, - // }, - // 1, - // ); - - // return magazineInfo.data; - // } - - // async getFollowupData(wellKnownUrl) { - // const wellKnownInfo = await this.client.getUrlWithRetry( - // wellKnownUrl, - // { - // headers: { - // "Content-Type": "application/ld+json", - // Accept: "application/ld+json", - // }, - // }, - // 3, - // ); - // return wellKnownInfo.data; - // } - - // // get list of all known kbin servers - // async getKBin() { - // logging.info("Fetching KBin Instances"); - - // this.fediverseData = await storage.fediverse.getAll(); - - // const kbinFedServers = Object.entries(this.fediverseData) - // .filter((fediServer) => { - // return fediServer[1].name === "kbin"; - // }) - // .map((fediServer) => { - // return { - // base: fediServer[0].replace("fediverse:", ""), - // ...fediServer[1], - // }; - // }); - - // return kbinFedServers; - // } -} + const magazinesData = await crawler.crawlMagazinesData(baseUrl); + + console.log("magazinesData", magazinesData.length); + // console.log("magazinesData", magazinesData[0]); + + await storage.tracking.setLastCrawl("mbin", `${baseUrl}`, instanceData); + + await storage.tracking.setLastCrawl("mbin", baseUrl, { + duration: (Date.now() - startTime) / 1000, + }); + + return magazinesData; + } catch (error) { + if (error instanceof CrawlTooRecentError) { + logging.warn(`[MBinQueue] [${baseUrl}] CrawlTooRecentError: ${error.message}`); + return true; + } + + const errorDetail = { + error: error.message, + // stack: error.stack, + isAxiosError: error.isAxiosError, + requestUrl: error.isAxiosError ? 
error.request.url : null, + time: Date.now(), + }; + + await storage.tracking.upsertError("mbin", baseUrl, errorDetail); + + logging.error(`[MBinQueue] [${baseUrl}] Error: ${error.message}`, error); + } + + return false; +}; diff --git a/crawler/src/lib/const.ts b/crawler/src/lib/const.ts index 6c5a72c..3c2cc5f 100644 --- a/crawler/src/lib/const.ts +++ b/crawler/src/lib/const.ts @@ -57,6 +57,7 @@ export const CRON_SCHEDULES = { AGED: "*/15 * * * *", UPTIME: "0 */12 * * *", KBIN: "0 */6 * * *", + MBIN: "0 */6 * * *", FEDISEER: "0 */12 * * *", }; diff --git a/crawler/src/lib/crawlStorage.ts b/crawler/src/lib/crawlStorage.ts index f4f822a..776c6bc 100644 --- a/crawler/src/lib/crawlStorage.ts +++ b/crawler/src/lib/crawlStorage.ts @@ -110,7 +110,7 @@ export class CrawlStorage { ); } - async redisZAdd(key: string, score: number, value: string): Promise { + async redisZAdd(key: string, score: number, value: string | number): Promise { if (typeof value !== "string") { value = JSON.stringify(value); } diff --git a/crawler/src/lib/storage/mbin.ts b/crawler/src/lib/storage/mbin.ts index 7d27ded..5a497d5 100644 --- a/crawler/src/lib/storage/mbin.ts +++ b/crawler/src/lib/storage/mbin.ts @@ -85,7 +85,7 @@ export default class MBinStore { baseUrl: string, magazineName: string, attributeName: string, - attributeValue: string, + attributeValue: string | number, ) { return await this.storage.redisZAdd( `attributes:mbin_magazine:${baseUrl}:${magazineName}:${attributeName}`, diff --git a/crawler/src/queue/mbin.ts b/crawler/src/queue/mbin.ts index 274a671..700ec2f 100644 --- a/crawler/src/queue/mbin.ts +++ b/crawler/src/queue/mbin.ts @@ -5,56 +5,15 @@ import { CrawlTooRecentError } from "../lib/error"; import BaseQueue, { IJobProcessor, ISuccessCallback } from "./BaseQueue"; -import MBinCrawler from "../crawl/mbin"; +import { mbinInstanceProcessor } from "../crawl/mbin"; export default class MBinQueue extends BaseQueue { - constructor(isWorker = false, queueName = "kbin") { - const processor: IJobProcessor = async ({ baseUrl }) => { - const startTime = Date.now(); - - try { - // check for recent scan of this KBIN instance - const lastCrawl = await storage.tracking.getLastCrawl("mbin", baseUrl); - if (lastCrawl) { - const lastCrawledMsAgo = Date.now() - lastCrawl.time; - throw new CrawlTooRecentError(`Skipping - Crawled too recently (${lastCrawledMsAgo / 1000}s ago)`); - } - - const crawler = new MBinCrawler(); - const mbinInstance = await crawler.processOneInstance(baseUrl); - - await storage.tracking.setLastCrawl("mbin", baseUrl, { - duration: (Date.now() - startTime) / 1000, - }); - - return mbinInstance; - } catch (error) { - if (error instanceof CrawlTooRecentError) { - logging.warn(`[MBinQueue] [${baseUrl}] CrawlTooRecentError: ${error.message}`); - return true; - } - - const errorDetail = { - error: error.message, - // stack: error.stack, - isAxiosError: error.isAxiosError, - requestUrl: error.isAxiosError ? 
error.request.url : null, - time: Date.now(), - }; - - await storage.tracking.upsertError("mbin", baseUrl, errorDetail); - - logging.error(`[MBinQueue] [${baseUrl}] Error: ${error.message}`, error); - } - - return false; - }; - - super(isWorker, queueName, processor); + constructor(isWorker = false, queueName = "mbin") { + super(isWorker, queueName, mbinInstanceProcessor); } // use as KBinQueue.createJob({ baseUrl: "https://kbin.io" }); - async createJob(baseUrl: string, onSuccess: ISuccessCallback = null) { + async createJob(baseUrl: string, onSuccess: ISuccessCallback | null = null) { await super.createJob(baseUrl, { baseUrl }, onSuccess); } } From 51828557a956430f994175690b60da79c1c999be Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 00:09:41 +0800 Subject: [PATCH 03/15] scanner --- crawler/src/crawl/mbin.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/src/crawl/mbin.ts b/crawler/src/crawl/mbin.ts index a810ed9..73a1975 100644 --- a/crawler/src/crawl/mbin.ts +++ b/crawler/src/crawl/mbin.ts @@ -296,7 +296,7 @@ export default class CrawlMBin { return magazines; } catch (e) { // mbin will return a 404 at the end of results - if (e.response.status === 404 && pageNumber > 1) { + if (e.response?.status === 404 && pageNumber > 1) { return []; } From bb67696c12faa833d76c9d96c15d3e42262e108c Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 00:20:06 +0800 Subject: [PATCH 04/15] update ecosystem --- crawler/ecosystem.config.cjs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crawler/ecosystem.config.cjs b/crawler/ecosystem.config.cjs index 3a9494c..2430c8d 100644 --- a/crawler/ecosystem.config.cjs +++ b/crawler/ecosystem.config.cjs @@ -37,10 +37,10 @@ module.exports = { }, { ...defaultOptions, - output: "./.data/logs/kbin.log", - name: "crawl-kbin", - args: ["-w", "kbin"], - instances: 4, + output: "./.data/logs/mbin.log", + name: "crawl-mbin", + args: ["-w", "mbin"], + instances: 2, }, ], }; From 2514b4aafd7e078ae2c0555c6e57073001a487ec Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 00:24:34 +0800 Subject: [PATCH 05/15] strip out kbin scanning --- crawler/src/bin/manual.ts | 11 -- crawler/src/bin/task.ts | 18 +-- crawler/src/bin/worker.ts | 19 +-- crawler/src/crawl/kbin.ts | 223 ------------------------------------ crawler/src/crawl/sketch.sh | 22 ---- crawler/src/lib/const.ts | 1 - crawler/src/queue/kbin.ts | 60 ---------- 7 files changed, 6 insertions(+), 348 deletions(-) delete mode 100644 crawler/src/crawl/kbin.ts delete mode 100644 crawler/src/crawl/sketch.sh delete mode 100644 crawler/src/queue/kbin.ts diff --git a/crawler/src/bin/manual.ts b/crawler/src/bin/manual.ts index 4c54b8a..5a5898a 100644 --- a/crawler/src/bin/manual.ts +++ b/crawler/src/bin/manual.ts @@ -3,7 +3,6 @@ import logging from "../lib/logging"; import InstanceQueue from "../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; -import KBinQueue from "../queue/kbin"; import MBinQueue from "../queue/mbin"; export default async function runManualWorker(workerName: string, firstParam: string, secondParam: string) { @@ -37,16 +36,6 @@ export default async function runManualWorker(workerName: string, firstParam: st }); } - // scan one kbin - else if (workerName == "k" || workerName == "kbin") { - logging.info(`Running Singel Q Scan KBIN Crawl for ${firstParam}`); - const crawlKBinManual = new KBinQueue(true, "kbin_manual"); - await 
crawlKBinManual.createJob(firstParam, (resultData) => { - logging.info("KBIN Crawl Complete"); - process.exit(0); - }); - } - // scan one mbin else if (workerName == "m" || workerName == "mbin") { logging.info(`Running Single MBin Crawl for ${firstParam}`); diff --git a/crawler/src/bin/task.ts b/crawler/src/bin/task.ts index 4ed835d..5ceaa45 100644 --- a/crawler/src/bin/task.ts +++ b/crawler/src/bin/task.ts @@ -5,14 +5,13 @@ import storage from "../lib/crawlStorage"; import InstanceQueue from "../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; -import KBinQueue from "../queue/kbin"; +import MBinQueue from "../queue/mbin"; import CrawlOutput from "../output/output"; import { syncCheckpoint } from "../output/sync_s3"; import CrawlUptime from "../crawl/uptime"; import CrawlFediseer from "../crawl/fediseer"; -import CrawlKBin from "../crawl/kbin"; import CrawlMBin from "../crawl/mbin"; import CrawlAged from "../util/aged"; @@ -100,11 +99,11 @@ export default async function runTask(taskName: string) { ...commSingleCounts, }); - const kbinQHealthCrawl = new KBinQueue(false); - const kbinQHeCounts = await kbinQHealthCrawl.queue.checkHealth(); + const mbinQHealthCrawl = new MBinQueue(false); + const mbinQHeCounts = await mbinQHealthCrawl.queue.checkHealth(); healthData.push({ - queue: "KBinQueue", - ...kbinQHeCounts, + queue: "MBinQueue", + ...mbinQHeCounts, }); console.info("Queue Health Metrics"); @@ -123,13 +122,6 @@ export default async function runTask(taskName: string) { break; - // create jobs for all known kbin instances - case "kbin": - const kbinScan = new CrawlKBin(); - await kbinScan.createJobsAllKBin(); - - break; - // create jobs for all known mbin instances case "mbin": const mbinScan = new CrawlMBin(); diff --git a/crawler/src/bin/worker.ts b/crawler/src/bin/worker.ts index 8ff1016..d4be44b 100644 --- a/crawler/src/bin/worker.ts +++ b/crawler/src/bin/worker.ts @@ -7,14 +7,12 @@ import storage from "../lib/crawlStorage"; import InstanceQueue from "../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; -import KBinQueue from "../queue/kbin"; import MBinQueue from "../queue/mbin"; // used to create scheduled instance checks import CrawlAged from "../util/aged"; import CrawlFediseer from "../crawl/fediseer"; import CrawlUptime from "../crawl/uptime"; -import CrawlKBin from "../crawl/kbin"; import CrawlMBin from "../crawl/mbin"; import { syncCheckpoint } from "../output/sync_s3"; @@ -38,9 +36,6 @@ export default async function startWorker(startWorkerName: string) { } else if (startWorkerName == "single") { logging.info("Starting SingleCommunityQueue Processor"); new SingleCommunityQueue(true); - } else if (startWorkerName == "kbin") { - logging.info("Starting KBinQueue Processor"); - new KBinQueue(true); } else if (startWorkerName == "mbin") { logging.info("Starting MBinQueue Processor"); new MBinQueue(true); @@ -76,19 +71,7 @@ export default async function startWorker(startWorkerName: string) { }); } - // shares CRON_SCHEDULES.KBIN - logging.info("Creating KBin Cron Task", CRON_SCHEDULES.KBIN); - cron.schedule(CRON_SCHEDULES.KBIN, async (time) => { - console.log("Running KBin Cron Task", time); - await storage.connect(); - - const kbinScan = new CrawlKBin(); - await kbinScan.createJobsAllKBin(); - - await storage.close(); - }); - - // shares CRON_SCHEDULES.KBIN + // shares CRON_SCHEDULES.MBIN logging.info("Creating KBin 
Cron Task", CRON_SCHEDULES.KBIN); cron.schedule(CRON_SCHEDULES.MBIN, async (time) => { console.log("Running MBin Cron Task", time); diff --git a/crawler/src/crawl/kbin.ts b/crawler/src/crawl/kbin.ts deleted file mode 100644 index 518c00d..0000000 --- a/crawler/src/crawl/kbin.ts +++ /dev/null @@ -1,223 +0,0 @@ -import path from "node:path"; -import util from "node:util"; -import { exec } from "node:child_process"; - -import logging from "../lib/logging"; - -import storage from "../lib/crawlStorage"; -import { IFediverseDataKeyValue } from "../lib/storage/fediverse"; - -import { CrawlError, CrawlTooRecentError } from "../lib/error"; - -import KBinQueue from "../queue/kbin"; -import InstanceQueue from "../queue/instance"; - -import CrawlClient from "../lib/CrawlClient"; - -const execAsync = util.promisify(exec); - -export default class CrawlKBin { - private fediverseData: IFediverseDataKeyValue | null; - private logPrefix: string; - - private instanceQueue: InstanceQueue; - - private client: CrawlClient; - - constructor() { - this.fediverseData = null; - this.logPrefix = `[CrawlKBin]`; - - this.instanceQueue = new InstanceQueue(false); - - this.client = new CrawlClient(); - } - - // scan the full list of fediverse marked instances with "kbin" - async createJobsAllKBin() { - try { - // get all fedi kbin servers - const kbinServers = await this.getKBin(); - logging.info(`KBin Instances Total: ${kbinServers.length}`); - - const kbinQueue = new KBinQueue(false); - for (const kbinServer of kbinServers) { - this.logPrefix = `[CrawlKBin] [${kbinServer.base}]`; - console.log(`${this.logPrefix} create job ${kbinServer.base}`); - - await kbinQueue.createJob(kbinServer.base); - } - } catch (e) { - console.error(`${this.logPrefix} error scanning kbin instance`, e); - } - } - - // scan a single kbin instance's magazines - async processOneInstance(kbinBaseUrl) { - let sketchyList = await this.getSketch(kbinBaseUrl); - sketchyList = sketchyList.filter((mag) => mag != ""); - // fix spaces - sketchyList = sketchyList.map((mag) => { - if (mag.indexOf(" ") !== -1) { - return mag.split(" ")[0]; - } - return mag; - }); - - const localMagazines = sketchyList.filter((mag) => { - if (mag.indexOf("@") !== -1) { - return false; - } - return true; - }); - - const nonLocalMagazines = sketchyList.filter((mag) => { - if (mag.indexOf("@") !== -1) { - return true; - } - return false; - }); - - console.log( - `${this.logPrefix} [${kbinBaseUrl}] local: ${localMagazines.length} others: ${nonLocalMagazines.length} `, - ); - - if (localMagazines.length > 0) { - for (const mag of localMagazines) { - try { - // check for recent scan of this magazine - const lastCrawl = await storage.tracking.getLastCrawl("magazine", `${kbinBaseUrl}:${mag}`); - if (lastCrawl) { - const lastCrawledMsAgo = Date.now() - lastCrawl.time; - throw new CrawlTooRecentError( - `Skipping - Crawled too recently (${lastCrawledMsAgo / 1000}s ago)`, - ); - } - - await this.getStoreMag(kbinBaseUrl, mag); - } catch (e) { - console.error(`${this.logPrefix} error scanning kbin MAG`, kbinBaseUrl, mag, e.message); - } - // await new Promise((resolve) => setTimeout(resolve, 1000)); - } - } - - // create kbin job to scan non-local baseurls - if (nonLocalMagazines.length > 0) { - // const kbinQueue = new KBinQueue(false); - for (const otherName of nonLocalMagazines) { - // console.log(`${this.logPrefix} otherName`, otherName); - - const split = otherName.split("@"); - // console.log(`${this.logPrefix} split`, split); - - if (split.length === 2) { - // must have two parts, 
we only want the second bit after the @ - // add to the instance queue to validate it is a kbin instance - await this.instanceQueue.createJob(split[1]); - } - } - } - - return; - } - - async getStoreMag(kbinBaseUrl: string, mag) { - const magazineInfo = await this.getMagazineInfo(kbinBaseUrl, mag); - - if (magazineInfo.type === "Group") { - const followerData = await this.getFollowupData(magazineInfo.followers); - const followers = followerData.totalItems; - - console.log(`got followers`, followers); - - // save group - const saveGroup = { - baseurl: kbinBaseUrl, - followerCount: followers, - title: magazineInfo.name, - - // name must overide the name from the api - ...magazineInfo, - name: mag, - }; - await storage.kbin.upsert(kbinBaseUrl, saveGroup); - await storage.tracking.setLastCrawl("magazine", `${kbinBaseUrl}:${mag}`, { - followers, - }); - - logging.info(`${this.logPrefix} mag: ${mag} Saved KBin Magazine`); - } else { - console.log(`${this.logPrefix} mag: ${mag} is not a group`, magazineInfo); - } - - return; - } - - // this calls the current method from here https://github.com/tgxn/lemmy-explorer/issues/100#issuecomment-1617444934 - async getSketch(baseUrl) { - var currentPath = process.cwd(); - const printHelloCommand = `/bin/bash ${path.join(currentPath, "src", "crawl", "sketch.sh")} ${baseUrl}`; - const results = await execAsync(printHelloCommand); - // console.log(results.stdout); - - const mappedArray = results.stdout.split("\n"); - - if (!Array.isArray(mappedArray)) { - throw new CrawlError(`failed to get sketch (${baseUrl}): ${results.stdout}`); - } - - return mappedArray; - } - - // uses non-documented api on instances to get a json list of all kbin magazine data - async getMagazineInfo(baseUrl, magazineName) { - console.log(`${this.logPrefix} getMagazineInfo`, "https://" + baseUrl + "/m/" + magazineName); - const magazineInfo = await this.client.getUrlWithRetry( - "https://" + baseUrl + "/m/" + magazineName, - { - headers: { - "Content-Type": "application/ld+json", - Accept: "application/ld+json", - }, - }, - 1, - ); - - return magazineInfo.data; - } - - async getFollowupData(wellKnownUrl) { - const wellKnownInfo = await this.client.getUrlWithRetry( - wellKnownUrl, - { - headers: { - "Content-Type": "application/ld+json", - Accept: "application/ld+json", - }, - }, - 3, - ); - return wellKnownInfo.data; - } - - // get list of all known kbin servers - async getKBin() { - logging.info("Fetching KBin Instances"); - - this.fediverseData = await storage.fediverse.getAll(); - - const kbinFedServers = Object.entries(this.fediverseData) - .filter((fediServer) => { - return fediServer[1].name === "kbin"; - }) - .map((fediServer) => { - return { - base: fediServer[0].replace("fediverse:", ""), - ...fediServer[1], - }; - }); - - return kbinFedServers; - } -} diff --git a/crawler/src/crawl/sketch.sh b/crawler/src/crawl/sketch.sh deleted file mode 100644 index 4f07b19..0000000 --- a/crawler/src/crawl/sketch.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Parses all kbin magazines listed on a given instance -# Pass the results to 'grep -v @' to filter out magazines on other instances - -INSTANCE=$1 - -if [ -z $INSTANCE ]; then - echo "Provide an instance as the only parameter (like kbin.social)" - exit 1 -fi - -function parse() { - PAGE=$1 - curl -m 10 --fail -s https://${INSTANCE}/magazines?p=$PAGE | grep form | grep '/subscribe' | sed 's/.*="\/m\/\(.*\)\/subscribe"/\1/g' - return ${PIPESTATUS[0]} -} - -for idx in $(seq 10000); do - if ! 
parse $idx; then - break - fi -done diff --git a/crawler/src/lib/const.ts b/crawler/src/lib/const.ts index 3c2cc5f..7c7fa1b 100644 --- a/crawler/src/lib/const.ts +++ b/crawler/src/lib/const.ts @@ -56,7 +56,6 @@ export const CRON_SCHEDULES = { PUBLISH_S3: process.env.PUBLISH_S3_CRON || "0 */4 * * *", // every 4 hours AGED: "*/15 * * * *", UPTIME: "0 */12 * * *", - KBIN: "0 */6 * * *", MBIN: "0 */6 * * *", FEDISEER: "0 */12 * * *", }; diff --git a/crawler/src/queue/kbin.ts b/crawler/src/queue/kbin.ts deleted file mode 100644 index 3fde807..0000000 --- a/crawler/src/queue/kbin.ts +++ /dev/null @@ -1,60 +0,0 @@ -import logging from "../lib/logging"; -import storage from "../lib/crawlStorage"; - -import { CrawlTooRecentError } from "../lib/error"; - -import BaseQueue, { IJobProcessor, ISuccessCallback } from "./BaseQueue"; - -import KBinCrawler from "../crawl/kbin"; - -export default class KBinQueue extends BaseQueue { - constructor(isWorker = false, queueName = "kbin") { - const processor: IJobProcessor = async ({ baseUrl }) => { - const startTime = Date.now(); - - try { - // check for recent scan of this KBIN instance - const lastCrawl = await storage.tracking.getLastCrawl("kbin", baseUrl); - if (lastCrawl) { - const lastCrawledMsAgo = Date.now() - lastCrawl.time; - throw new CrawlTooRecentError(`Skipping - Crawled too recently (${lastCrawledMsAgo / 1000}s ago)`); - } - - const crawler = new KBinCrawler(); - const kbinInstance = await crawler.processOneInstance(baseUrl); - - await storage.tracking.setLastCrawl("kbin", baseUrl, { - duration: (Date.now() - startTime) / 1000, - }); - - return kbinInstance; - } catch (error) { - if (error instanceof CrawlTooRecentError) { - logging.warn(`[KBinQueue] [${baseUrl}] CrawlTooRecentError: ${error.message}`); - return true; - } - - const errorDetail = { - error: error.message, - // stack: error.stack, - isAxiosError: error.isAxiosError, - requestUrl: error.isAxiosError ? 
error.request.url : null, - time: Date.now(), - }; - - await storage.tracking.upsertError("kbin", baseUrl, errorDetail); - - logging.error(`[KBinQueue] [${baseUrl}] Error: ${error.message}`, error); - } - - return false; - }; - - super(isWorker, queueName, processor); - } - - // use as KBinQueue.createJob({ baseUrl: "https://kbin.io" }); - async createJob(baseUrl: string, onSuccess: ISuccessCallback = null) { - await super.createJob(baseUrl, { baseUrl }, onSuccess); - } -} From 6422f201cd545049bd2869806c9b664be6661c41 Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 00:26:55 +0800 Subject: [PATCH 06/15] remove kbin refs --- crawler/src/bin/worker.ts | 2 +- crawler/src/crawl/instance.ts | 12 ++++++------ crawler/src/util/aged.ts | 1 - 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/crawler/src/bin/worker.ts b/crawler/src/bin/worker.ts index d4be44b..d8da754 100644 --- a/crawler/src/bin/worker.ts +++ b/crawler/src/bin/worker.ts @@ -72,7 +72,7 @@ export default async function startWorker(startWorkerName: string) { } // shares CRON_SCHEDULES.MBIN - logging.info("Creating KBin Cron Task", CRON_SCHEDULES.KBIN); + logging.info("Creating MBin Cron Task", CRON_SCHEDULES.MBIN); cron.schedule(CRON_SCHEDULES.MBIN, async (time) => { console.log("Running MBin Cron Task", time); await storage.connect(); diff --git a/crawler/src/crawl/instance.ts b/crawler/src/crawl/instance.ts index 20165fe..1639d18 100644 --- a/crawler/src/crawl/instance.ts +++ b/crawler/src/crawl/instance.ts @@ -19,13 +19,13 @@ import InstanceQueue from "../queue/instance"; import CrawlClient from "../lib/CrawlClient"; -import KBinQueue from "../queue/kbin"; +import MBinQueue from "../queue/mbin"; export default class InstanceCrawler { private crawlDomain: string; private logPrefix: string; - private kbinQueue: KBinQueue; + private mbinQueue: MBinQueue; private client: CrawlClient; @@ -33,7 +33,7 @@ export default class InstanceCrawler { this.crawlDomain = crawlDomain; this.logPrefix = `[Instance] [${this.crawlDomain}]`; - this.kbinQueue = new KBinQueue(false); + this.mbinQueue = new MBinQueue(false); this.client = new CrawlClient(); } @@ -110,9 +110,9 @@ export default class InstanceCrawler { await storage.fediverse.upsert(this.crawlDomain, nodeInfo.software); // scan kbin instances that are found - if (nodeInfo.software.name == "kbin") { - console.log(`${this.crawlDomain}: found kbin instance - creating job`); - await this.kbinQueue.createJob(this.crawlDomain); + if (nodeInfo.software.name == "mbin") { + console.log(`${this.crawlDomain}: found mbin instance - creating job`); + await this.mbinQueue.createJob(this.crawlDomain); } // only allow lemmy instances diff --git a/crawler/src/util/aged.ts b/crawler/src/util/aged.ts index eb4b19c..9f56c6f 100644 --- a/crawler/src/util/aged.ts +++ b/crawler/src/util/aged.ts @@ -3,7 +3,6 @@ import logging from "../lib/logging"; import InstanceQueue from "../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; -import KBinQueue from "../queue/kbin"; import storage from "../lib/crawlStorage"; From 20aaa79e69d6fbda5e7981ea6a523383e80d5aa9 Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 00:27:04 +0800 Subject: [PATCH 07/15] keep --- crawler/src/util/aged.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/crawler/src/util/aged.ts b/crawler/src/util/aged.ts index 9f56c6f..f5a04d7 100644 --- a/crawler/src/util/aged.ts +++ b/crawler/src/util/aged.ts @@ -3,6 +3,7 @@ import logging from 
"../lib/logging"; import InstanceQueue from "../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; +// import KBinQueue from "../queue/kbin"; import storage from "../lib/crawlStorage"; From cda3632f2c37243ff58bf22ba5ac2016c283ed5e Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 19:37:35 +0800 Subject: [PATCH 08/15] output script changes --- crawler/src/lib/const.ts | 2 + crawler/src/lib/crawlStorage.ts | 3 - crawler/src/lib/storage/kbin.ts | 66 -------- crawler/src/lib/storage/mbin.ts | 10 +- crawler/src/output/file_writer.ts | 258 +++++++++++++++++++++--------- crawler/src/output/output.ts | 214 +++++++++---------------- crawler/src/util/aged.ts | 41 ++++- 7 files changed, 304 insertions(+), 290 deletions(-) delete mode 100644 crawler/src/lib/storage/kbin.ts diff --git a/crawler/src/lib/const.ts b/crawler/src/lib/const.ts index 7c7fa1b..472abf2 100644 --- a/crawler/src/lib/const.ts +++ b/crawler/src/lib/const.ts @@ -38,6 +38,8 @@ export const CRAWL_AGED_TIME = { // if a server is identified as a non-lemmy server, ho often should we wait before checking again? FEDIVERSE: hoursToMs(2 * 24), // 2 days + + MAGAZINE: hoursToMs(8), }; // consider for deletion after they haven't been seen for this long diff --git a/crawler/src/lib/crawlStorage.ts b/crawler/src/lib/crawlStorage.ts index 776c6bc..5be2069 100644 --- a/crawler/src/lib/crawlStorage.ts +++ b/crawler/src/lib/crawlStorage.ts @@ -6,7 +6,6 @@ import logging from "./logging"; // core import InstanceStore from "./storage/instance"; import CommunityStore from "./storage/community"; -import KBinStore from "./storage/kbin"; import MBinStore from "./storage/mbin"; // supporting @@ -29,7 +28,6 @@ export class CrawlStorage { public fediverse: FediverseStore; public fediseer: FediseerStore; public tracking: TrackingStore; - public kbin: KBinStore; public mbin: MBinStore; constructor() { @@ -46,7 +44,6 @@ export class CrawlStorage { this.fediverse = new FediverseStore(this); this.fediseer = new FediseerStore(this); this.tracking = new TrackingStore(this); - this.kbin = new KBinStore(this); this.mbin = new MBinStore(this); } diff --git a/crawler/src/lib/storage/kbin.ts b/crawler/src/lib/storage/kbin.ts deleted file mode 100644 index 6f4cc2b..0000000 --- a/crawler/src/lib/storage/kbin.ts +++ /dev/null @@ -1,66 +0,0 @@ -import { CrawlStorage } from "../crawlStorage"; - -export type IMagazineData = { - baseUrl: string; - name: string; - description: string; - lastCrawled: number; - [key: string]: any; -}; - -export type IMagazineDataKeyValue = { - [key: string]: IMagazineData; -}; - -export default class KBinStore { - private storage: CrawlStorage; - - constructor(storage: CrawlStorage) { - this.storage = storage; - } - - async getAll(): Promise { - return this.storage.listRedis(`magazine:*`); - } - - async getAllWithKeys(): Promise { - return this.storage.listRedisWithKeys(`magazine:*`); - } - - async getOne(baseUrl: string, magazineName: string) { - return this.storage.getRedis(`magazine:${baseUrl}:${magazineName}`); - } - - async upsert(baseUrl: string, magazine: IMagazineData) { - const storeData = { - ...magazine, - lastCrawled: Date.now(), - }; - return this.storage.putRedis(`magazine:${baseUrl}:${magazine.name.toLowerCase()}`, storeData); - } - - async delete(baseUrl: string, magazineName: string, reason = "unknown") { - const oldRecord = await this.getOne(baseUrl, magazineName); - await this.storage.putRedis(`deleted:magazine:${baseUrl}:${magazineName}`, 
{ - ...oldRecord, - deletedAt: Date.now(), - deleteReason: reason, - }); - - return this.storage.deleteRedis(`magazine:${baseUrl}:${magazineName}`); - } - - // use these to track magazine attributes over time - async setTrackedAttribute( - baseUrl: string, - magazineName: string, - attributeName: string, - attributeValue: string, - ) { - return await this.storage.redisZAdd( - `attributes:magazine:${baseUrl}:${magazineName}:${attributeName}`, - Date.now(), - attributeValue, - ); - } -} diff --git a/crawler/src/lib/storage/mbin.ts b/crawler/src/lib/storage/mbin.ts index 5a497d5..bd2bee4 100644 --- a/crawler/src/lib/storage/mbin.ts +++ b/crawler/src/lib/storage/mbin.ts @@ -37,6 +37,7 @@ export type IMagazineData = { serverSoftwareVersion: any; isPostingRestrictedToMods: boolean; lastCrawled?: number; + baseurl: string; }; export type IMagazineDataKeyValue = { [key: string]: IMagazineData; @@ -50,7 +51,14 @@ export default class MBinStore { } async getAll(): Promise { - return this.storage.listRedis(`mbin_magazine:*`); + const magazineKeyValue = this.storage.listRedisWithKeys(`mbin_magazine:*`); + + // put baseUrl into the magazine object + for (const key in magazineKeyValue) { + magazineKeyValue[key].baseurl = key.split(":")[1]; + } + + return Object.values(magazineKeyValue); } async getAllWithKeys(): Promise { diff --git a/crawler/src/output/file_writer.ts b/crawler/src/output/file_writer.ts index bc24023..402c804 100644 --- a/crawler/src/output/file_writer.ts +++ b/crawler/src/output/file_writer.ts @@ -1,5 +1,7 @@ import path from "node:path"; -import { open, rm, mkdir } from "node:fs/promises"; +import { open, rm, mkdir, FileHandle } from "node:fs/promises"; + +import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime"; /** * OutputFileWriter - This class handles writing the output JSON files. 
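A note on the `getAll()` change above: magazines are stored under Redis keys of the form `mbin_magazine:<baseurl>:<name>`, and the instance base URL is recovered from the key rather than from the stored record. A minimal TypeScript sketch of that recovery step, assuming that key scheme (the names here are illustrative, not the real store API):

type MagazineRecord = { name: string; baseurl?: string; [key: string]: any };

// Attach the base URL parsed from keys like "mbin_magazine:fedia.io:technology".
function withBaseUrls(keyValue: Record<string, MagazineRecord>): MagazineRecord[] {
  for (const key in keyValue) {
    // segment [1] of "mbin_magazine:<baseurl>:<name>" is the base URL
    keyValue[key].baseurl = key.split(":")[1];
  }
  return Object.values(keyValue);
}

// withBaseUrls({ "mbin_magazine:fedia.io:technology": { name: "technology" } })
// yields [{ name: "technology", baseurl: "fedia.io" }]

One thing to watch in the hunk as committed: `listRedisWithKeys` returns a promise, so the result needs an `await` before the `for...in` loop will see any keys; PATCH 10 below adds exactly that.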
@@ -10,6 +12,139 @@ import { open, rm, mkdir } from "node:fs/promises"; // love you all +export type IMetaDataOutput = { + instances: number; + communities: number; + mbin_instances: number; // @ NEW + magazines: number; + fediverse: number; + + time: number; + package: string; + version: string; + + linked?: any; + allowed?: any; + blocked?: any; +}; + +export type IInstanceDataOutput = { + baseurl: string; + url: string; + name: string; + desc: string; + downvotes: boolean; + nsfw: boolean; + create_admin: boolean; + private: boolean; + fed: boolean; + version: string; + open: boolean; + usage: number; + counts: Object; + icon: string; + banner: string; + langs: string[]; + date: string; + published: number; + time: number; + score: number; + uptime?: IUptimeNodeData; + isSuspicious: boolean; + metrics: Object | null; + tags: string[]; + susReason: string[]; + trust: []; + blocks: { + incoming: number; + outgoing: number; + }; + blocked: string[]; +}; + +export type ICommunityDataOutput = { + baseurl: string; + url: string; + name: string; + title: string; + desc: string; + icon: string | null; + banner: string | null; + nsfw: boolean; + counts: Object; + published: number; + time: number; + isSuspicious: boolean; + score: number; +}; + +export type IMBinInstanceOutput = { + // actor_id: string; + // title: string; + // name: string; + // preferred: string; + // baseurl: string; + // summary: string; + // sensitive: boolean; + // postingRestrictedToMods: boolean; + // icon: string; + // published: string; + // updated: string; + // followers: number; + // time: number; +}; + +export type IMBinMagazineOutput = { + baseUrl: string; + magazineId: number; + title: string; + name: string; + description: string; + isAdult: boolean; + postingRestrictedToMods: boolean; + icon: string; + subscriptions: number; + posts: number; + time: number; +}; + +export type IFediverseDataOutput = { + url: string; + software: string; + version: string; +}; + +export type IClassifiedErrorOutput = { + baseurl: string; + time: number; + error: string; + type?: string; +}; + +// type IInstanceOutput = {}; + +// // minified version, only enough for sort/filter +// // { +// // "base": "lemmy.ml", +// // "title": "Lemmy!", +// // "name": "lemmy", +// // "desc": "lemmy instance is cool and stuff!", +// // "sort": { +// // "score": 724, //smart sort +// // "subscribers": 1, +// // "users": "users_active_week", +// // "posts": 0, +// // "comments": 0, +// // } +// // } +// type IInstanceMinOutput = {}; +// type IInstanceMetaOutput = {}; + +// type ICommunityOutput = {}; +// type ICommunityMinOutput = {}; + +// type IMagazineOutput = {}; + // split communities.json and instances.json into smaller files for easier loading // community-index.json @@ -40,21 +175,6 @@ import { open, rm, mkdir } from "node:fs/promises"; // "score": 724 // } -// minified version, only enough for sort/filter -// { -// "base": "lemmy.ml", -// "title": "Lemmy!", -// "name": "lemmy", -// "desc": "lemmy instance is cool and stuff!", -// "sort": { -// "score": 724, //smart sort -// "subscribers": 1, -// "users": "users_active_week", -// "posts": 0, -// "comments": 0, -// } -// } - // instance-index.json // should do all the things needed to transform the redis data into data for frontend @@ -105,48 +225,10 @@ export default class OutputFileWriter { } } - /** - * this method is used to split the data into smaller files for easier loading - * - * @param {string} chunkName - the name of the chunk, used for the filename - * @param {number} perFile - how 
many entries per file - * @param {array} dataArray - the data array to split - */ - async storeChunkedData(chunkName: string, perFile: number, dataArray: any) { - await this.writeJsonFile(`${this.publicDataFolder}/${chunkName}.full.json`, JSON.stringify(dataArray)); - - // mapped versions and the metadata - await mkdir(path.join(this.publicDataFolder, chunkName), { - recursive: true, - }); - - let fileCount = 0; - for (let i = 0; i < dataArray.length; i += perFile) { - let chunk = dataArray.slice(i, i + perFile); - - await this.writeJsonFile( - `${this.publicDataFolder}/${chunkName}/${fileCount}.json`, - JSON.stringify(chunk), - ); - fileCount++; - } - - await this.writeJsonFile( - `${this.publicDataFolder}/${chunkName}.json`, - JSON.stringify({ - count: fileCount, - }), - ); - } - /** * this method is used to store the fediverse data - * - * @param {object} data - the fediverse data - * @param {object} softwareData - the fediverse software data - * @param {object} softwareBaseUrls - the fediverse software base urls */ - async storeFediverseData(data: any, softwareData: any, softwareBaseUrls: any, fediTags: any) { + public async storeFediverseData(data: any, softwareData: any, softwareBaseUrls: any, fediTags: any) { await this.writeJsonFile(`${this.publicDataFolder}/fediverse.json`, JSON.stringify(data)); await this.writeJsonFile( `${this.publicDataFolder}/fediverse_software_counts.json`, @@ -163,11 +245,8 @@ export default class OutputFileWriter { /** * this method is used to store the instance metrics data - * - * @param {string} instanceBaseUrl - the base url of the instance - * @param {object} data - the instance metrics data */ - async storeInstanceMetricsData(instanceBaseUrl: String, data: any) { + public async storeInstanceMetricsData(instanceBaseUrl: String, data: any) { await mkdir(this.metricsPath, { recursive: true, }); @@ -177,11 +256,9 @@ export default class OutputFileWriter { /** * this method is used to store the community metrics data - * - * @param {string} instanceBaseUrl - the base url of the instance - * @param {object} data - the instance metrics data */ - async storeCommunityMetricsData(instanceBaseUrl: string, communityData: any) { + public async storeCommunityMetricsData(instanceBaseUrl: string, communityData: any) { + // make sure the directory exists for the instance await mkdir(`${this.communityMetricsPath}/${instanceBaseUrl}`, { recursive: true, }); @@ -192,35 +269,72 @@ export default class OutputFileWriter { ); } - async storeMetaData(data: any) { + public async storeMetaData(data: IMetaDataOutput) { await this.writeJsonFile(`${this.publicDataFolder}/meta.json`, JSON.stringify(data)); } - async storeInstanceErrors(data: any) { + public async storeInstanceErrors(data: any) { await this.writeJsonFile(`${this.publicDataFolder}/instanceErrors.json`, JSON.stringify(data)); } - async storeSuspicousData(data: any) { + public async storeSuspicousData(data: any) { await this.writeJsonFile(`${this.publicDataFolder}/sus.json`, JSON.stringify(data)); } - async storeKbinInstanceList(data: any) { + // stores an array of the string baseUrl + public async storeMBinInstanceData(data: string[]) { await this.writeJsonFile(`${this.publicDataFolder}/kbin.min.json`, JSON.stringify(data)); } - async storeKBinMagazineData(data: any) { + public async storeMBinMagazineData(data: any) { await this.storeChunkedData("magazines", this.magazinesPerFile, data); } - async cleanData() { + /** + * this method is used to clean (delete all files) the data folder + */ + public async cleanData(): 
Promise { await rm(this.publicDataFolder, { recursive: true, force: true }); await mkdir(this.publicDataFolder, { recursive: true }); } - async writeJsonFile(filename: string, data: any) { - let filehandle: any = null; + /** + * this method is used to split the data into smaller files for easier loading + */ + private async storeChunkedData(chunkName: string, perFile: number, dataArray: []): Promise { + await this.writeJsonFile(`${this.publicDataFolder}/${chunkName}.full.json`, JSON.stringify(dataArray)); + + // mapped versions and the metadata + await mkdir(path.join(this.publicDataFolder, chunkName), { + recursive: true, + }); + + let fileCount = 0; + for (let i = 0; i < dataArray.length; i += perFile) { + let chunk = dataArray.slice(i, i + perFile); + + await this.writeJsonFile( + `${this.publicDataFolder}/${chunkName}/${fileCount}.json`, + JSON.stringify(chunk), + ); + fileCount++; + } + + await this.writeJsonFile( + `${this.publicDataFolder}/${chunkName}.json`, + JSON.stringify({ + count: fileCount, + }), + ); + } + + /** + * this method is used to write a JSON file + */ + private async writeJsonFile(fileName: string, data: string): Promise { + let filehandle: FileHandle | null = null; try { - filehandle = await open(filename, "w"); + filehandle = await open(fileName, "w"); await filehandle?.writeFile(data); } finally { diff --git a/crawler/src/output/output.ts b/crawler/src/output/output.ts index edb5cc4..f9dd6fb 100644 --- a/crawler/src/output/output.ts +++ b/crawler/src/output/output.ts @@ -10,9 +10,10 @@ import CrawlClient from "../lib/CrawlClient"; import storage from "../lib/crawlStorage"; import { IInstanceData, IInstanceDataKeyValue } from "../lib/storage/instance"; import { ICommunityData, ICommunityDataKeyValue } from "../lib/storage/community"; -import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/kbin"; +import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/mbin"; import { IFediverseData, IFediverseDataKeyValue } from "../lib/storage/fediverse"; // import { IFediseerInstanceData } from "../lib/storage/fediseer"; + import { IErrorData, IErrorDataKeyValue, @@ -21,88 +22,17 @@ import { } from "../lib/storage/tracking"; import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime"; -import OutputFileWriter from "./file_writer"; +import OutputFileWriter, { + IMetaDataOutput, + IInstanceDataOutput, + ICommunityDataOutput, + IMBinInstanceOutput, + IMBinMagazineOutput, + IFediverseDataOutput, + IClassifiedErrorOutput, +} from "./file_writer"; import OutputTrust from "./trust"; -export type IKBinMagazineOutput = { - actor_id: string; - title: string; - name: string; - preferred: string; - baseurl: string; - summary: string; - sensitive: boolean; - postingRestrictedToMods: boolean; - icon: string; - published: string; - updated: string; - followers: number; - time: number; -}; - -export type IFediverseDataOutput = { - url: string; - software: string; - version: string; -}; - -export type IClassifiedErrorOutput = { - baseurl: string; - time: number; - error: string; - type?: string; -}; - -export type ICommunityDataOutput = { - baseurl: string; - url: string; - name: string; - title: string; - desc: string; - icon: string | null; - banner: string | null; - nsfw: boolean; - counts: Object; - published: number; - time: number; - isSuspicious: boolean; - score: number; -}; - -export type IInstanceDataOutput = { - baseurl: string; - url: string; - name: string; - desc: string; - downvotes: boolean; - nsfw: boolean; - create_admin: boolean; - 
private: boolean; - fed: boolean; - version: string; - open: boolean; - usage: number; - counts: Object; - icon: string; - banner: string; - langs: string[]; - date: string; - published: number; - time: number; - score: number; - uptime?: IUptimeNodeData; - isSuspicious: boolean; - metrics: Object | null; - tags: string[]; - susReason: string[]; - trust: []; - blocks: { - incoming: number; - outgoing: number; - }; - blocked: string[]; -}; - class OutputUtils { // strip markdown, optionally substring static stripMarkdownSubStr(text: string, maxLength: number = -1) { @@ -225,8 +155,8 @@ class OutputUtils { previousRun, returnInstanceArray: IInstanceDataOutput[], returnCommunityArray: ICommunityDataOutput[], - kbinInstanceArray: string[], - kbinMagazineArray: IKBinMagazineOutput[], + mbinInstanceArray: string[], + mbinMagazineArray: IMBinMagazineOutput[], returnStats: IFediverseDataOutput[], ) { const issues: string[] = []; @@ -235,8 +165,8 @@ class OutputUtils { if ( returnInstanceArray.length === 0 || returnCommunityArray.length === 0 || - kbinInstanceArray.length === 0 || - kbinMagazineArray.length === 0 || + mbinInstanceArray.length === 0 || + mbinMagazineArray.length === 0 || returnStats.length === 0 ) { console.log("Empty Array"); @@ -305,16 +235,16 @@ class OutputUtils { old: previousRun.fediverse, }); - // @TODO kbin checks are disabled till scanning is fixed + // @TODO mbin checks are disabled till scanning is fixed // data.push({ // type: "magazines", - // new: kbinMagazineArray.length, + // new: mbinMagazineArray.length, // old: previousRun.magazines, // }); // data.push({ - // type: "kbin_instances", - // new: kbinInstanceArray.length, - // old: previousRun.kbin_instances, + // type: "mbin_instances", + // new: mbinInstanceArray.length, + // old: previousRun.mbin_instances, // }); for (let i = 0; i < data.length; i++) { @@ -354,7 +284,7 @@ export default class CrawlOutput { private instanceList: IInstanceData[] | null; private communityList: ICommunityData[] | null; private fediverseData: IFediverseDataKeyValue | null; - private kbinData: IMagazineData[] | null; + private mbinData: IMagazineData[] | null; private fileWriter: OutputFileWriter; private trust: OutputTrust; @@ -366,7 +296,7 @@ export default class CrawlOutput { this.instanceList = null; this.communityList = null; this.fediverseData = null; - this.kbinData = null; + this.mbinData = null; // this.utils = new OutputUtils(); @@ -382,7 +312,7 @@ export default class CrawlOutput { this.instanceList = await storage.instance.getAll(); this.communityList = await storage.community.getAll(); this.fediverseData = await storage.fediverse.getAll(); - this.kbinData = await storage.kbin.getAll(); + this.mbinData = await storage.mbin.getAll(); } /** @@ -403,8 +333,8 @@ export default class CrawlOutput { throw new Error("No fediverse Data"); } - if (!this.kbinData) { - throw new Error("No kbin Data"); + if (!this.mbinData) { + throw new Error("No mbin Data"); } // setup trust data @@ -447,9 +377,9 @@ export default class CrawlOutput { // fediverse data const returnStats = await this.outputFediverseData(returnInstanceArray); - // kbin data - const kbinInstanceArray = await this.outputKBinInstanceList(returnStats); - const kbinMagazineArray = await this.outputKBinMagazineList(); + // mbin data + const mbinInstanceArray = await this.outputMBinInstanceList(returnStats); + const mbinMagazineArray = await this.outputMBinMagazineList(); // error data const instanceErrors = await this.outputClassifiedErrors(); @@ -459,12 +389,11 @@ export 
default class CrawlOutput { (await readFile(new URL("../../package.json", import.meta.url))).toString(), ); - const metaData = { + const metaData: IMetaDataOutput = { instances: returnInstanceArray.length, communities: returnCommunityArray.length, - kbin_instances: kbinInstanceArray.length, - magazines: kbinMagazineArray.length, - // kbin_instances: kbinInstanceArray.length, + mbin_instances: mbinInstanceArray.length, + magazines: mbinMagazineArray.length, fediverse: returnStats.length, time: Date.now(), package: packageJson.name, @@ -508,20 +437,22 @@ export default class CrawlOutput { Previous: previousRun.communities, Change: calcChangeDisplay(returnCommunityArray.length, previousRun.communities), }, - KBinInstances: { - ExportName: "KBin Instances", + + MBinInstances: { + ExportName: "MBin Instances", Total: "N/A", - Output: kbinInstanceArray.length, - Previous: previousRun.kbin_instances, - Change: calcChangeDisplay(kbinInstanceArray.length, previousRun.kbin_instances), + Output: mbinInstanceArray.length, + Previous: previousRun.mbin_instances, + Change: calcChangeDisplay(mbinInstanceArray.length, previousRun.mbin_instances), }, Magazines: { ExportName: "Magazines", - Total: this.kbinData.length, - Output: kbinMagazineArray.length, + Total: this.mbinData.length, + Output: mbinMagazineArray.length, Previous: previousRun.magazines, - Change: calcChangeDisplay(kbinMagazineArray.length, previousRun.magazines), + Change: calcChangeDisplay(mbinMagazineArray.length, previousRun.magazines), }, + Fediverse: { ExportName: "Fediverse Servers", Total: "N/A", @@ -547,8 +478,8 @@ export default class CrawlOutput { previousRun, returnInstanceArray, returnCommunityArray, - kbinInstanceArray, - kbinMagazineArray, + mbinInstanceArray, + mbinMagazineArray, returnStats, ); @@ -1023,14 +954,14 @@ export default class CrawlOutput { return returnStats; } - // KBIN + // mbin - private async outputKBinInstanceList(returnStats: IFediverseDataOutput[]): Promise { - let kbinInstanceUrls: string[] = returnStats + private async outputMBinInstanceList(returnStats: IFediverseDataOutput[]): Promise { + let mbinInstanceUrls: string[] = returnStats .map((fediverse) => { // const fediverse = this.fediverseData[fediKey]; - if (fediverse.software && fediverse.software === "kbin") { + if (fediverse.software && fediverse.software === "mbin") { return fediverse.url; } @@ -1038,50 +969,51 @@ export default class CrawlOutput { }) .filter((instance) => instance !== null); - await this.fileWriter.storeKbinInstanceList(kbinInstanceUrls); + await this.fileWriter.storeMBinInstanceData(mbinInstanceUrls); - return kbinInstanceUrls; + return mbinInstanceUrls; } // generate a list of all the instances that are suspicious and the reasons - private async outputKBinMagazineList(): Promise { - const output: IKBinMagazineOutput[] = []; + private async outputMBinMagazineList(): Promise { + const output: IMBinMagazineOutput[] = []; - if (!this.kbinData) { - throw new Error("No KBin data"); + if (!this.mbinData) { + throw new Error("No MBin data"); } // filter old data - const filteredKBins = this.kbinData.filter((kbin) => { - return kbin.lastCrawled > Date.now() - OUTPUT_MAX_AGE.MAGAZINE; + const filteredMBins = this.mbinData.filter((mbin) => { + if (!mbin.lastCrawled) return false; // record needs time + return mbin.lastCrawled > Date.now() - OUTPUT_MAX_AGE.MAGAZINE; }); - logging.info("KBin Magazines filteredKBins", this.kbinData.length, filteredKBins.length); + logging.info("MBin Magazines filteredMBins", this.mbinData.length, 
filteredMBins.length); - for (const kbin of filteredKBins) { + for (const mbin of filteredMBins) { output.push({ - actor_id: kbin.id, - - title: kbin.title, // display name - name: kbin.name, // key username - preferred: kbin.preferredUsername, // username ?? + baseUrl: mbin.baseurl, + magazineId: mbin.magazineId, - baseurl: kbin.id.split("/")[2], + title: mbin.title, // display name + name: mbin.name, // key username + // preferred: mbin.preferredUsername, // username ?? - summary: OutputUtils.stripMarkdownSubStr(kbin.summary, 350), - sensitive: kbin.sensitive, - postingRestrictedToMods: kbin.postingRestrictedToMods, + description: OutputUtils.stripMarkdownSubStr(mbin.description, 350), + isAdult: mbin.isAdult, + postingRestrictedToMods: mbin.isPostingRestrictedToMods, - icon: kbin.icon ? kbin.icon.url : null, - published: kbin.published, - updated: kbin.updated, - followers: kbin.followerCount, + icon: mbin.icon ? mbin.icon.url : null, + // published: mbin.published, + // updated: mbin.updated, + subscriptions: mbin.subscriptionsCount, + posts: mbin.postCount, - time: kbin.lastCrawled || 0, + time: mbin.lastCrawled || 0, }); } - await this.fileWriter.storeKBinMagazineData(output); + await this.fileWriter.storeMBinMagazineData(output); return output; } diff --git a/crawler/src/util/aged.ts b/crawler/src/util/aged.ts index f5a04d7..e2d6526 100644 --- a/crawler/src/util/aged.ts +++ b/crawler/src/util/aged.ts @@ -3,7 +3,7 @@ import logging from "../lib/logging"; import InstanceQueue from "../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; -// import KBinQueue from "../queue/kbin"; +import MBinQueue from "../queue/mbin"; import storage from "../lib/crawlStorage"; @@ -15,7 +15,7 @@ export default class CrawlAged { private instanceCrawler: InstanceQueue; private communityCrawler: CommunityQueue; private singleCommunityCrawler: SingleCommunityQueue; - // private kbinCrawler: KBinQueue; + private mbinCrawler: MBinQueue; constructor() { this.agedInstanceBaseUrls = []; @@ -24,8 +24,8 @@ export default class CrawlAged { this.communityCrawler = new CommunityQueue(false); this.singleCommunityCrawler = new SingleCommunityQueue(false); - // @TODO scan for aged kbin magazines - // this.kbinCrawler = new KBinQueue(false); + // scan for aged magazines + this.mbinCrawler = new MBinQueue(false); } async recordAges() { @@ -33,7 +33,7 @@ export default class CrawlAged { const instances = await storage.instance.getAll(); const communities = await storage.community.getAll(); - const magazines = await storage.kbin.getAll(); + const magazines = await storage.mbin.getAll(); const fediverse = await storage.fediverse.getAll(); const errors = await storage.tracking.getAllErrors("*"); const lastCrawls = await storage.tracking.listAllLastCrawl(); @@ -219,7 +219,7 @@ export default class CrawlAged { return false; }); - logging.info("Aged Communities By Base", Object.keys(byBase).length); + logging.info("Aged Communities By Base (showing over 100 communities)", Object.keys(byBase).length); const baseCounts = Object.keys(byBase) .map((base) => { @@ -228,7 +228,8 @@ export default class CrawlAged { count: byBase[base].length, }; }) - .sort((a, b) => a.count - b.count); + .sort((a, b) => a.count - b.count) + .filter((a) => a.count > 100); console.table(baseCounts); @@ -252,6 +253,32 @@ export default class CrawlAged { await this.communityCrawler.createJob(baseUrl); } + // get aged magazines + const magazines = await storage.mbin.getAll(); + 
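For orientation while this hunk continues below: the pass being added selects magazines whose `lastCrawled` is unset or older than `CRAWL_AGED_TIME.MAGAZINE` (eight hours, per PATCH 08), then dedupes their base URLs so each instance is queued once. A condensed sketch of that logic, with illustrative types:

const MAGAZINE_AGE_MS = 8 * 60 * 60 * 1000; // mirrors CRAWL_AGED_TIME.MAGAZINE

type AgedMagazine = { baseurl: string; lastCrawled?: number };

function agedInstanceBaseUrls(magazines: AgedMagazine[], now = Date.now()): string[] {
  const aged = magazines.filter((m) => !m.lastCrawled || now - m.lastCrawled > MAGAZINE_AGE_MS);
  return [...new Set(aged.map((m) => m.baseurl))]; // one entry per instance
}

async function enqueueAgedMagazines(
  queue: { createJob(baseUrl: string): Promise<unknown> },
  magazines: AgedMagazine[],
): Promise<void> {
  for (const baseUrl of agedInstanceBaseUrls(magazines)) {
    await queue.createJob(baseUrl); // awaiting surfaces enqueue failures
  }
}

The committed loop fires `createJob` without `await`; awaiting it as sketched keeps a failed enqueue from becoming an unhandled rejection.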
+ const agedMagazines = Object.values(magazines).filter((magazine) => { + if (!magazine.lastCrawled) return true; // not set + + if (Date.now() - magazine.lastCrawled > CRAWL_AGED_TIME.MAGAZINE) { + return true; + } + + return false; + }); + + // get base url for each magazine + const agedMagazineBaseUrls = agedMagazines.map((magazine) => magazine.baseurl); + + // filter those dupes + const uniqueAgedMagazineBaseUrls = [...new Set(agedMagazineBaseUrls)]; + + logging.info( + `Magazines Total: ${magazines.length} Aged ${agedMagazineBaseUrls.length}, Total Instances: ${uniqueAgedMagazineBaseUrls.length}`, + ); + for (const baseUrl of uniqueAgedMagazineBaseUrls) { + this.mbinCrawler.createJob(baseUrl); + } + logging.info("Done Creating Aged Jobs"); } } From f3d74da8385fc807c0209b62744e7c5c8c15eabf Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 19:45:16 +0800 Subject: [PATCH 09/15] finish removing kbin --- crawler/src/bin/worker.ts | 4 ++-- crawler/src/crawl/instance.ts | 4 ++-- crawler/src/crawl/mbin.ts | 8 ++++---- crawler/src/lib/const.ts | 2 +- crawler/src/output/file_writer.ts | 2 +- crawler/src/output/trust.ts | 2 +- crawler/src/queue/BaseQueue.ts | 2 +- crawler/src/queue/mbin.ts | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/crawler/src/bin/worker.ts b/crawler/src/bin/worker.ts index d8da754..95fb60b 100644 --- a/crawler/src/bin/worker.ts +++ b/crawler/src/bin/worker.ts @@ -77,8 +77,8 @@ export default async function startWorker(startWorkerName: string) { console.log("Running MBin Cron Task", time); await storage.connect(); - const kbinScan = new CrawlMBin(); - await kbinScan.createJobsAllMBin(); + const mbinScan = new CrawlMBin(); + await mbinScan.createJobsAllMBin(); await storage.close(); }); diff --git a/crawler/src/crawl/instance.ts b/crawler/src/crawl/instance.ts index 1639d18..da5e8b5 100644 --- a/crawler/src/crawl/instance.ts +++ b/crawler/src/crawl/instance.ts @@ -109,7 +109,7 @@ export default class InstanceCrawler { // store all fediverse instance software for easy metrics await storage.fediverse.upsert(this.crawlDomain, nodeInfo.software); - // scan kbin instances that are found + // scan mbin instances that are found if (nodeInfo.software.name == "mbin") { console.log(`${this.crawlDomain}: found mbin instance - creating job`); await this.mbinQueue.createJob(this.crawlDomain); @@ -325,7 +325,7 @@ export const instanceProcessor: IJobProcessor = async ({ baseUrl }) => { if ( knownFediverseServer.name !== "lemmy" && knownFediverseServer.name !== "lemmybb" && - knownFediverseServer.name !== "kbin" && + // knownFediverseServer.name !== "mbin" && knownFediverseServer.time && Date.now() - knownFediverseServer.time < CRAWL_AGED_TIME.FEDIVERSE // re-scan fedi servers to check their status ) { diff --git a/crawler/src/crawl/mbin.ts b/crawler/src/crawl/mbin.ts index 73a1975..19f904e 100644 --- a/crawler/src/crawl/mbin.ts +++ b/crawler/src/crawl/mbin.ts @@ -87,10 +87,10 @@ export default class CrawlMBin { * - `/api/magazine/{magazine_id}` to get magazine info */ - // scan the full list of fediverse marked instances with "kbin" + // scan the full list of fediverse marked instances with "mbin" async createJobsAllMBin() { try { - // get all fedi kbin servers + // get all fedi mbin servers const mbinServers = await this.getInstances(); logging.info(`MBin Instances Total: ${mbinServers.length}`); @@ -102,7 +102,7 @@ export default class CrawlMBin { await mbinQueue.createJob(mbinServer.base); } } catch (e) { - console.error(`${this.logPrefix} error scanning 
kbin instance`, e); + console.error(`${this.logPrefix} error scanning mbin instance`, e); } } @@ -338,7 +338,7 @@ export const mbinInstanceProcessor: IJobProcessor = async ({ baseUrl }) => { const startTime = Date.now(); try { - // check for recent scan of this KBIN instance + // check for recent scan of this mbin instance const lastCrawl = await storage.tracking.getLastCrawl("mbin", baseUrl); if (lastCrawl) { const lastCrawledMsAgo = Date.now() - lastCrawl.time; diff --git a/crawler/src/lib/const.ts b/crawler/src/lib/const.ts index 472abf2..429b968 100644 --- a/crawler/src/lib/const.ts +++ b/crawler/src/lib/const.ts @@ -21,7 +21,7 @@ export const PUBLISH_S3_BUCKET = process.env.PUBLISH_S3_BUCKET || "lemmyexplorer export const CRAWL_TIMEOUT = { INSTANCE: 30 * 60 * 1000, // 30 mins COMMUNITY: 120 * 60 * 1000, // 2 hours - KBIN: 60 * 60 * 1000, // one hour + MBIN: 60 * 60 * 1000, // one hour }; // max age to be included in output diff --git a/crawler/src/output/file_writer.ts b/crawler/src/output/file_writer.ts index 402c804..6ad3b6f 100644 --- a/crawler/src/output/file_writer.ts +++ b/crawler/src/output/file_writer.ts @@ -283,7 +283,7 @@ export default class OutputFileWriter { // stores an array of the string baseUrl public async storeMBinInstanceData(data: string[]) { - await this.writeJsonFile(`${this.publicDataFolder}/kbin.min.json`, JSON.stringify(data)); + await this.writeJsonFile(`${this.publicDataFolder}/mbin.min.json`, JSON.stringify(data)); } public async storeMBinMagazineData(data: any) { diff --git a/crawler/src/output/trust.ts b/crawler/src/output/trust.ts index 3e8cac0..f29b07f 100644 --- a/crawler/src/output/trust.ts +++ b/crawler/src/output/trust.ts @@ -3,7 +3,7 @@ import divinator from "divinator"; import storage from "../lib/crawlStorage"; import { IInstanceData, IInstanceDataKeyValue } from "../lib/storage/instance"; import { ICommunityData, ICommunityDataKeyValue } from "../lib/storage/community"; -import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/kbin"; +import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/mbin"; import { IFediverseData, IFediverseDataKeyValue } from "../lib/storage/fediverse"; import { IFediseerInstanceData } from "../lib/storage/fediseer"; import { diff --git a/crawler/src/queue/BaseQueue.ts b/crawler/src/queue/BaseQueue.ts index 6ee7285..9e8a5c5 100644 --- a/crawler/src/queue/BaseQueue.ts +++ b/crawler/src/queue/BaseQueue.ts @@ -52,7 +52,7 @@ export default class BaseQueue { onSuccess && onSuccess(result); }); - await job.timeout(CRAWL_TIMEOUT.KBIN).setId(jobId).save(); + await job.timeout(CRAWL_TIMEOUT.MBIN).setId(jobId).save(); } process() { diff --git a/crawler/src/queue/mbin.ts b/crawler/src/queue/mbin.ts index 700ec2f..7280662 100644 --- a/crawler/src/queue/mbin.ts +++ b/crawler/src/queue/mbin.ts @@ -12,7 +12,7 @@ export default class MBinQueue extends BaseQueue { super(isWorker, queueName, mbinInstanceProcessor); } - // use as KBinQueue.createJob({ baseUrl: "https://kbin.io" }); + // use as MBinQueue.createJob({ baseUrl: "https://fedia.io" }); async createJob(baseUrl: string, onSuccess: ISuccessCallback | null = null) { await super.createJob(baseUrl, { baseUrl }, onSuccess); } } From c43989619e20e393c9c3be13253a8a23c543c194 Mon Sep 17 00:00:00 2001 From: tgxn Date: Tue, 7 Jan 2025 21:39:23 +0800 Subject: [PATCH 10/15] output script etc --- crawler/src/crawl/mbin.ts | 9 ++++++++- crawler/src/lib/storage/mbin.ts | 20 +++++++++++++----- crawler/src/output/output.ts | 1 - crawler/src/util/aged.ts | 2 ++ 4 files
changed, 25 insertions(+), 7 deletions(-) diff --git a/crawler/src/crawl/mbin.ts b/crawler/src/crawl/mbin.ts index 19f904e..86c8b11 100644 --- a/crawler/src/crawl/mbin.ts +++ b/crawler/src/crawl/mbin.ts @@ -6,6 +6,7 @@ import logging from "../lib/logging"; import storage from "../lib/crawlStorage"; import { IFediverseDataKeyValue } from "../lib/storage/fediverse"; +import { IMagazineData } from "../lib/storage/mbin"; import { CrawlError, CrawlTooRecentError } from "../lib/error"; @@ -307,7 +308,13 @@ export default class CrawlMBin { // validate the community is for the domain being scanned, and save it async storeMagazineData(crawlDomain: string, magazineData: IIncomingMagazineData) { - await storage.mbin.upsert(crawlDomain, magazineData); + const outMagazineData: IMagazineData = { + baseurl: crawlDomain, + ...magazineData, + lastCrawled: Date.now(), + }; + + await storage.mbin.upsert(crawlDomain, outMagazineData); await storage.mbin.setTrackedAttribute( crawlDomain, diff --git a/crawler/src/lib/storage/mbin.ts b/crawler/src/lib/storage/mbin.ts index bd2bee4..f7b1313 100644 --- a/crawler/src/lib/storage/mbin.ts +++ b/crawler/src/lib/storage/mbin.ts @@ -9,7 +9,10 @@ export type IMagazineData = { username: string; apId: any; }; - icon: any; + icon: { + storageUrl: string; + [key: string]: any; + }; name: string; title: string; description: string; @@ -22,12 +25,19 @@ export type IMagazineData = { isAdult: boolean; isUserSubscribed: any; isBlockedByUser: any; - tags: any; - badges: any[]; + tags: string[]; + badges: { + badgeId: number; + magazineId: number; + name: string; + }[]; moderators: { magazineId: number; userId: number; - avatar: any; + avatar: { + storageUrl: string; + [key: string]: any; + }; username: string; apId: any; }[]; @@ -51,7 +61,7 @@ export default class MBinStore { } async getAll(): Promise { - const magazineKeyValue = this.storage.listRedisWithKeys(`mbin_magazine:*`); + const magazineKeyValue = await this.storage.listRedisWithKeys(`mbin_magazine:*`); // put baseUrl into the magazine object for (const key in magazineKeyValue) { diff --git a/crawler/src/output/output.ts b/crawler/src/output/output.ts index f9dd6fb..0551cf6 100644 --- a/crawler/src/output/output.ts +++ b/crawler/src/output/output.ts @@ -974,7 +974,6 @@ export default class CrawlOutput { return mbinInstanceUrls; } - // generate a list of all the instances that are suspicious and the reasons private async outputMBinMagazineList(): Promise { const output: IMBinMagazineOutput[] = []; diff --git a/crawler/src/util/aged.ts b/crawler/src/util/aged.ts index e2d6526..276beaa 100644 --- a/crawler/src/util/aged.ts +++ b/crawler/src/util/aged.ts @@ -38,6 +38,8 @@ export default class CrawlAged { const errors = await storage.tracking.getAllErrors("*"); const lastCrawls = await storage.tracking.listAllLastCrawl(); + console.log("Record Counts", magazines.length); + const healthData: any = []; // get age distribution From d48647ccd370d52cc1b2fb9511efb9a45b00830c Mon Sep 17 00:00:00 2001 From: tgxn Date: Wed, 8 Jan 2025 00:01:08 +0800 Subject: [PATCH 11/15] wok on kbin --- crawler/src/crawl/fediseer.ts | 3 +- crawler/src/crawl/instance.ts | 2 +- crawler/src/crawl/mbin.ts | 6 +- crawler/src/lib/storage/community.ts | 30 +-- crawler/src/lib/storage/fediseer.ts | 39 +--- crawler/src/lib/storage/fediverse.ts | 13 +- crawler/src/lib/storage/instance.ts | 14 +- crawler/src/lib/storage/mbin.ts | 53 +---- crawler/src/lib/storage/tracking.ts | 30 +-- crawler/src/lib/storage/uptime.ts | 17 +- crawler/src/output/file_writer.ts | 
119 +---------- crawler/src/output/output.ts | 23 +- crawler/src/output/trust.ts | 14 +- frontend/src/App.tsx | 4 +- .../GridView/{KBin.tsx => MBin.tsx} | 2 +- .../GridView/{KBinCard.tsx => MBinCard.tsx} | 27 ++- .../src/components/Header/HeaderSideMenu.tsx | 9 +- frontend/src/components/Header/KBinIcon.tsx | 4 +- frontend/src/components/Header/MBinIcon.tsx | 77 +++++++ .../components/Header/SelectHomeInstance.tsx | 2 +- .../ListView/{KBin.tsx => MBin.tsx} | 10 +- .../{KBinMagazines.tsx => MBinMagazines.tsx} | 58 ++--- pages/src/App.tsx | 8 +- types/output.ts | 110 ++++++++++ types/storage.ts | 202 ++++++++++++++++++ 25 files changed, 508 insertions(+), 368 deletions(-) rename frontend/src/components/GridView/{KBin.tsx => MBin.tsx} (96%) rename frontend/src/components/GridView/{KBinCard.tsx => MBinCard.tsx} (89%) create mode 100644 frontend/src/components/Header/MBinIcon.tsx rename frontend/src/components/ListView/{KBin.tsx => MBin.tsx} (93%) rename frontend/src/pages/{KBinMagazines.tsx => MBinMagazines.tsx} (86%) create mode 100644 types/output.ts create mode 100644 types/storage.ts diff --git a/crawler/src/crawl/fediseer.ts b/crawler/src/crawl/fediseer.ts index f603d41..12ddfae 100644 --- a/crawler/src/crawl/fediseer.ts +++ b/crawler/src/crawl/fediseer.ts @@ -1,6 +1,7 @@ import logging from "../lib/logging"; import storage from "../lib/crawlStorage"; -import { IFediseerInstanceData, IFediseerTag } from "../lib/storage/fediseer"; + +import { IFediseerInstanceData, IFediseerTag } from "../../../types/storage"; import CrawlClient from "../lib/CrawlClient"; diff --git a/crawler/src/crawl/instance.ts b/crawler/src/crawl/instance.ts index da5e8b5..42da880 100644 --- a/crawler/src/crawl/instance.ts +++ b/crawler/src/crawl/instance.ts @@ -5,7 +5,7 @@ import { IErrorDataKeyValue, ILastCrawlData, ILastCrawlDataKeyValue, -} from "../lib/storage/tracking"; +} from "../../../types/storage"; import { CRAWL_AGED_TIME } from "../lib/const"; import { HTTPError, CrawlError, CrawlTooRecentError } from "../lib/error"; diff --git a/crawler/src/crawl/mbin.ts b/crawler/src/crawl/mbin.ts index 86c8b11..f6ff546 100644 --- a/crawler/src/crawl/mbin.ts +++ b/crawler/src/crawl/mbin.ts @@ -5,8 +5,8 @@ import { exec } from "node:child_process"; import logging from "../lib/logging"; import storage from "../lib/crawlStorage"; -import { IFediverseDataKeyValue } from "../lib/storage/fediverse"; -import { IMagazineData } from "../lib/storage/mbin"; +import { IFediverseDataKeyValue } from "../../../types/storage"; +import { IMagazineData } from "../../../types/storage"; import { CrawlError, CrawlTooRecentError } from "../lib/error"; @@ -311,6 +311,8 @@ export default class CrawlMBin { const outMagazineData: IMagazineData = { baseurl: crawlDomain, ...magazineData, + + icon: magazineData.icon.storageUrl ? 
magazineData.icon.storageUrl : null, lastCrawled: Date.now(), }; diff --git a/crawler/src/lib/storage/community.ts b/crawler/src/lib/storage/community.ts index d7bb30e..ce384d6 100644 --- a/crawler/src/lib/storage/community.ts +++ b/crawler/src/lib/storage/community.ts @@ -1,34 +1,6 @@ import { CrawlStorage } from "../crawlStorage"; -export type ICommunityData = { - community: { - id: number; - name: string; - title: string; - description: string; - removed: boolean; - published: string; - updated: string | null; - deleted: boolean; - nsfw: boolean; - actor_id: string; - local: boolean; - icon: string | null; - banner: string | null; - hidden: boolean; - posting_restricted_to_mods: boolean; - instance_id: number; - }; - subscribed: string; - blocked: boolean; - counts: Object; - banned_from_community?: boolean; - lastCrawled: number; -}; - -export type ICommunityDataKeyValue = { - [key: string]: ICommunityData; -}; +import { ICommunityData, ICommunityDataKeyValue } from "../../../../types/storage"; export default class Community { private storage: CrawlStorage; diff --git a/crawler/src/lib/storage/fediseer.ts b/crawler/src/lib/storage/fediseer.ts index e4870ab..756a2e8 100644 --- a/crawler/src/lib/storage/fediseer.ts +++ b/crawler/src/lib/storage/fediseer.ts @@ -1,43 +1,6 @@ import { CrawlStorage } from "../crawlStorage"; -export type IFediseerInstanceFlags = { - flag: "RESTRICTED" | "MUTED"; - comment: string; -}; - -export type IFediseerTag = { - tag: string; - count?: number; - rank?: number; -}; - -export type IFediseerInstanceData = { - id: number; - domain: string; - software: string; - version: string; - claimed: number; - open_registrations: boolean; - email_verify: boolean; - approval_required: boolean; - has_captcha: boolean; - approvals: number; - endorsements: number; - guarantor: string; - censure_reasons: string[] | null; - sysadmins: number; - moderators: number; - - state: "UP" | "UNREACHABLE" | "OFFLINE" | "DECOMMISSIONED"; - - tags: IFediseerTag[] | string[]; - - visibility_endorsements: "OPEN" | "ENDORSED" | "PRIVATE"; - visibility_censures: "OPEN" | "ENDORSED" | "PRIVATE"; - visibility_hesitations: "OPEN" | "ENDORSED" | "PRIVATE"; - - flags: IFediseerInstanceFlags[]; -}; +import { IFediseerInstanceData } from "../../../../types/storage"; export default class Fediseer { private storage: CrawlStorage; diff --git a/crawler/src/lib/storage/fediverse.ts b/crawler/src/lib/storage/fediverse.ts index 4c70869..86bbcfe 100644 --- a/crawler/src/lib/storage/fediverse.ts +++ b/crawler/src/lib/storage/fediverse.ts @@ -1,17 +1,6 @@ import { CrawlStorage } from "../crawlStorage"; -export type IFediverseData = { - time?: number; - baseurl?: string; - name?: string; - version?: string; - repository?: string; - homepage?: string; -}; - -export type IFediverseDataKeyValue = { - [key: string]: IFediverseData; -}; +import { IFediverseData, IFediverseDataKeyValue } from "../../../../types/storage"; export default class Fediverse { private storage: CrawlStorage; diff --git a/crawler/src/lib/storage/instance.ts b/crawler/src/lib/storage/instance.ts index 229f527..e60519b 100644 --- a/crawler/src/lib/storage/instance.ts +++ b/crawler/src/lib/storage/instance.ts @@ -1,23 +1,13 @@ import { CrawlStorage } from "../crawlStorage"; +import { IInstanceData, IInstanceDataKeyValue } from "../../../../types/storage"; + /** * Stores each lemmy instance, keyed on baseUrl as `instance:baseUrl`. 
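The thread running through these storage hunks is PATCH 11's type consolidation: each `crawler/src/lib/storage/*` module drops its local declarations and imports them from a repo-level `types/storage.ts` that the crawler, the output scripts, and the frontend can all share. A rough sketch of the shared module's shape, showing just two of the moved types exactly as they appear in the deleted declarations above (the real file carries all of them):

// types/storage.ts (sketch): single home for types formerly declared
// beside each store; crawler modules reach it via "../../../../types/storage".
export type IFediverseData = {
  time?: number;
  baseurl?: string;
  name?: string;
  version?: string;
  repository?: string;
  homepage?: string;
};

export type IFediverseDataKeyValue = {
  [key: string]: IFediverseData;
};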
* * Each instance is stored as a JSON object with the following fields: */ -export type IInstanceData = { - nodeData: any; - siteData: any; - headers: any; - langs: Array; - lastCrawled: number; -}; - -export type IInstanceDataKeyValue = { - [key: string]: IInstanceData; -}; - export default class Instance { private storage: CrawlStorage; diff --git a/crawler/src/lib/storage/mbin.ts b/crawler/src/lib/storage/mbin.ts index f7b1313..09c1385 100644 --- a/crawler/src/lib/storage/mbin.ts +++ b/crawler/src/lib/storage/mbin.ts @@ -1,57 +1,6 @@ import { CrawlStorage } from "../crawlStorage"; -export type IMagazineData = { - magazineId: number; - owner: { - magazineId: number; - userId: number; - avatar: any; - username: string; - apId: any; - }; - icon: { - storageUrl: string; - [key: string]: any; - }; - name: string; - title: string; - description: string; - rules: string; - subscriptionsCount: number; - entryCount: number; - entryCommentCount: number; - postCount: number; - postCommentCount: number; - isAdult: boolean; - isUserSubscribed: any; - isBlockedByUser: any; - tags: string[]; - badges: { - badgeId: number; - magazineId: number; - name: string; - }[]; - moderators: { - magazineId: number; - userId: number; - avatar: { - storageUrl: string; - [key: string]: any; - }; - username: string; - apId: any; - }[]; - apId: any; - apProfileId: string; - serverSoftware: any; - serverSoftwareVersion: any; - isPostingRestrictedToMods: boolean; - lastCrawled?: number; - baseurl: string; -}; -export type IMagazineDataKeyValue = { - [key: string]: IMagazineData; -}; +import { IMagazineData, IMagazineDataKeyValue } from "../../../../types/storage"; export default class MBinStore { private storage: CrawlStorage; diff --git a/crawler/src/lib/storage/tracking.ts b/crawler/src/lib/storage/tracking.ts index bdfe67a..b9f80f1 100644 --- a/crawler/src/lib/storage/tracking.ts +++ b/crawler/src/lib/storage/tracking.ts @@ -2,30 +2,12 @@ import { CrawlStorage } from "../crawlStorage"; import { RECORD_TTL_TIMES_SECONDS } from "../const"; -export type IErrorData = { - time: number; - error: string; - stack?: string; - isAxiosError?: boolean; - requestUrl?: string; - code?: string; - url?: string; - duration?: number; -}; - -export type IErrorDataKeyValue = { - [key: string]: IErrorData; -}; - -export type ILastCrawlData = { - time: number; - duration?: number; - [key: string]: any; -}; - -export type ILastCrawlDataKeyValue = { - [key: string]: ILastCrawlData; -}; +import { + IErrorData, + IErrorDataKeyValue, + ILastCrawlData, + ILastCrawlDataKeyValue, +} from "../../../../types/storage"; export default class TrackingStore { private storage: CrawlStorage; diff --git a/crawler/src/lib/storage/uptime.ts b/crawler/src/lib/storage/uptime.ts index 3c0efba..be832b9 100644 --- a/crawler/src/lib/storage/uptime.ts +++ b/crawler/src/lib/storage/uptime.ts @@ -1,21 +1,6 @@ import { CrawlStorage } from "../crawlStorage"; -export type IUptimeNodeData = { - domain: string; - latency: number; - countryname: string; - uptime_alltime: string; - date_created: string; - date_updated: string; - date_laststats: string; - score: number; - status: number; -}; - -export type IFullUptimeData = { - timestamp: number; - nodes: IUptimeNodeData[]; -}; +import { IUptimeNodeData, IFullUptimeData } from "../../../../types/storage"; export default class Uptime { private storage: CrawlStorage; diff --git a/crawler/src/output/file_writer.ts b/crawler/src/output/file_writer.ts index 6ad3b6f..dc65abf 100644 --- a/crawler/src/output/file_writer.ts +++ 
b/crawler/src/output/file_writer.ts @@ -1,7 +1,15 @@ import path from "node:path"; import { open, rm, mkdir, FileHandle } from "node:fs/promises"; -import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime"; +import { + IMetaDataOutput, + IInstanceDataOutput, + ICommunityDataOutput, + IMBinInstanceOutput, + IMBinMagazineOutput, + IFediverseDataOutput, + IClassifiedErrorOutput, +} from "../../../types/output"; /** * OutputFileWriter - This class handles writing the output JSON files. @@ -12,115 +20,6 @@ import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime"; // love you all -export type IMetaDataOutput = { - instances: number; - communities: number; - mbin_instances: number; // @ NEW - magazines: number; - fediverse: number; - - time: number; - package: string; - version: string; - - linked?: any; - allowed?: any; - blocked?: any; -}; - -export type IInstanceDataOutput = { - baseurl: string; - url: string; - name: string; - desc: string; - downvotes: boolean; - nsfw: boolean; - create_admin: boolean; - private: boolean; - fed: boolean; - version: string; - open: boolean; - usage: number; - counts: Object; - icon: string; - banner: string; - langs: string[]; - date: string; - published: number; - time: number; - score: number; - uptime?: IUptimeNodeData; - isSuspicious: boolean; - metrics: Object | null; - tags: string[]; - susReason: string[]; - trust: []; - blocks: { - incoming: number; - outgoing: number; - }; - blocked: string[]; -}; - -export type ICommunityDataOutput = { - baseurl: string; - url: string; - name: string; - title: string; - desc: string; - icon: string | null; - banner: string | null; - nsfw: boolean; - counts: Object; - published: number; - time: number; - isSuspicious: boolean; - score: number; -}; - -export type IMBinInstanceOutput = { - // actor_id: string; - // title: string; - // name: string; - // preferred: string; - // baseurl: string; - // summary: string; - // sensitive: boolean; - // postingRestrictedToMods: boolean; - // icon: string; - // published: string; - // updated: string; - // followers: number; - // time: number; -}; - -export type IMBinMagazineOutput = { - baseUrl: string; - magazineId: number; - title: string; - name: string; - description: string; - isAdult: boolean; - postingRestrictedToMods: boolean; - icon: string; - subscriptions: number; - posts: number; - time: number; -}; - -export type IFediverseDataOutput = { - url: string; - software: string; - version: string; -}; - -export type IClassifiedErrorOutput = { - baseurl: string; - time: number; - error: string; - type?: string; -}; - // type IInstanceOutput = {}; // // minified version, only enough for sort/filter diff --git a/crawler/src/output/output.ts b/crawler/src/output/output.ts index 0551cf6..312bb41 100644 --- a/crawler/src/output/output.ts +++ b/crawler/src/output/output.ts @@ -8,10 +8,10 @@ import logging from "../lib/logging"; import CrawlClient from "../lib/CrawlClient"; import storage from "../lib/crawlStorage"; -import { IInstanceData, IInstanceDataKeyValue } from "../lib/storage/instance"; -import { ICommunityData, ICommunityDataKeyValue } from "../lib/storage/community"; -import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/mbin"; -import { IFediverseData, IFediverseDataKeyValue } from "../lib/storage/fediverse"; +import { IInstanceData, IInstanceDataKeyValue } from "../../../types/storage"; +import { ICommunityData, ICommunityDataKeyValue } from "../../../types/storage"; +import { IMagazineData, IMagazineDataKeyValue } from 
"../../../types/storage"; +import { IFediverseData, IFediverseDataKeyValue } from "../../../types/storage"; // import { IFediseerInstanceData } from "../lib/storage/fediseer"; import { @@ -19,10 +19,12 @@ import { IErrorDataKeyValue, ILastCrawlData, ILastCrawlDataKeyValue, -} from "../lib/storage/tracking"; -import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime"; +} from "../../../types/storage"; +import { IUptimeNodeData, IFullUptimeData } from "../../../types/storage"; -import OutputFileWriter, { +import OutputFileWriter from "./file_writer"; + +import { IMetaDataOutput, IInstanceDataOutput, ICommunityDataOutput, @@ -30,7 +32,8 @@ import OutputFileWriter, { IMBinMagazineOutput, IFediverseDataOutput, IClassifiedErrorOutput, -} from "./file_writer"; +} from "../../../types/output"; + import OutputTrust from "./trust"; class OutputUtils { @@ -991,7 +994,7 @@ export default class CrawlOutput { for (const mbin of filteredMBins) { output.push({ - baseUrl: mbin.baseurl, + baseurl: mbin.baseurl, magazineId: mbin.magazineId, title: mbin.title, // display name @@ -1002,7 +1005,7 @@ export default class CrawlOutput { isAdult: mbin.isAdult, postingRestrictedToMods: mbin.isPostingRestrictedToMods, - icon: mbin.icon ? mbin.icon.url : null, + icon: mbin.icon?.storageUrl ? mbin.icon.storageUrl : null, // published: mbin.published, // updated: mbin.updated, subscriptions: mbin.subscriptionsCount, diff --git a/crawler/src/output/trust.ts b/crawler/src/output/trust.ts index f29b07f..addccac 100644 --- a/crawler/src/output/trust.ts +++ b/crawler/src/output/trust.ts @@ -1,18 +1,18 @@ import divinator from "divinator"; import storage from "../lib/crawlStorage"; -import { IInstanceData, IInstanceDataKeyValue } from "../lib/storage/instance"; -import { ICommunityData, ICommunityDataKeyValue } from "../lib/storage/community"; -import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/mbin"; -import { IFediverseData, IFediverseDataKeyValue } from "../lib/storage/fediverse"; -import { IFediseerInstanceData } from "../lib/storage/fediseer"; +import { IInstanceData, IInstanceDataKeyValue } from "../../../types/storage"; +import { ICommunityData, ICommunityDataKeyValue } from "../../../types/storage"; +import { IMagazineData, IMagazineDataKeyValue } from "../../../types/storage"; +import { IFediverseData, IFediverseDataKeyValue } from "../../../types/storage"; +import { IFediseerInstanceData } from "../../../types/storage"; import { IErrorData, IErrorDataKeyValue, ILastCrawlData, ILastCrawlDataKeyValue, -} from "../lib/storage/tracking"; -import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime"; +} from "../../../types/storage"; +import { IUptimeNodeData, IFullUptimeData } from "../../../types/storage"; import { OUTPUT_MAX_AGE } from "../lib/const"; diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index f54ce36..2f3d59f 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -26,7 +26,7 @@ import Join from "./pages/Join"; import Inspector from "./pages/Inspector"; import InstanceView from "./pages/InstanceView"; -import KBinMagazines from "./pages/KBinMagazines"; +import MBinMagazines from "./pages/MBinMagazines"; import AppStore from "./store"; @@ -76,7 +76,7 @@ export default function App() { } /> } /> - } /> + } /> {/* } /> */} diff --git a/frontend/src/components/GridView/KBin.tsx b/frontend/src/components/GridView/MBin.tsx similarity index 96% rename from frontend/src/components/GridView/KBin.tsx rename to frontend/src/components/GridView/MBin.tsx 
index 6780546..b1fa007 100644 --- a/frontend/src/components/GridView/KBin.tsx +++ b/frontend/src/components/GridView/MBin.tsx @@ -4,7 +4,7 @@ import { useMasonry, usePositioner, useContainerPosition, useScroller } from "ma import { useWindowSize } from "@react-hook/window-size"; -import KBinCard from "./KBinCard"; +import KBinCard from "./MBinCard"; type KBinGridProps = { items: any[]; diff --git a/frontend/src/components/GridView/KBinCard.tsx b/frontend/src/components/GridView/MBinCard.tsx similarity index 89% rename from frontend/src/components/GridView/KBinCard.tsx rename to frontend/src/components/GridView/MBinCard.tsx index 96c52ee..13dad67 100644 --- a/frontend/src/components/GridView/KBinCard.tsx +++ b/frontend/src/components/GridView/MBinCard.tsx @@ -20,11 +20,13 @@ import { CopyLink, ExtCommunityLink } from "../Shared/Link"; import { IconAvatar } from "../Shared/Avatar"; -type KBinCardProps = { - magazine: any; +import { IMBinMagazineOutput } from "../../../../types/output"; + +type MBinCardProps = { + magazine: IMBinMagazineOutput; }; -export default React.memo(function KBinCard({ magazine }: KBinCardProps) { +const MBinCard = React.memo(function ({ magazine }: MBinCardProps) { return ( - {/* Community Title */} + {/* Magazine Title */} - {magazine.summary ? magazine.summary : ""} + {magazine.description ? magazine.description : ""} - + - {/* + - + + {/* ); }); + +export default MBinCard; diff --git a/frontend/src/components/Header/HeaderSideMenu.tsx b/frontend/src/components/Header/HeaderSideMenu.tsx index 736e7e0..faefb9e 100644 --- a/frontend/src/components/Header/HeaderSideMenu.tsx +++ b/frontend/src/components/Header/HeaderSideMenu.tsx @@ -17,6 +17,7 @@ import ListDivider from "@mui/joy/ListDivider"; import SvgIcon from "@mui/material/SvgIcon"; import KBinLogo from "./KBinIcon"; +import MBinIcon from "./MBinIcon"; import MenuIcon from "@mui/icons-material/Menu"; import DarkModeRoundedIcon from "@mui/icons-material/DarkModeRounded"; @@ -190,14 +191,14 @@ function HeaderSideMenu({ filterSuspicious, dispatch }: IHeaderSideMenuProps) { // color={"info"} onClick={() => { handleClose(); - navigate("/kbin/magazines"); + navigate("/mbin/magazines"); }} - {...(location.pathname === "/kbin/magazines" && { selected: true, variant: "soft" })} + {...(location.pathname === "/mbin/magazines" && { selected: true, variant: "soft" })} > - + - KBin Magazines + MBin Magazines diff --git a/frontend/src/components/Header/KBinIcon.tsx b/frontend/src/components/Header/KBinIcon.tsx index 775d7cb..d16a3a1 100644 --- a/frontend/src/components/Header/KBinIcon.tsx +++ b/frontend/src/components/Header/KBinIcon.tsx @@ -1,5 +1,5 @@ import * as React from "react"; -const SvgResult = (props) => ( +const KBinIcon = (props) => ( ( /> ); -export default SvgResult; +export default KBinIcon; diff --git a/frontend/src/components/Header/MBinIcon.tsx b/frontend/src/components/Header/MBinIcon.tsx new file mode 100644 index 0000000..1c2824a --- /dev/null +++ b/frontend/src/components/Header/MBinIcon.tsx @@ -0,0 +1,77 @@ +import * as React from "react"; +const MBinIcon = (props) => ( + + + + {/* + + + */} + + + + + + + + + + + + + + + +); +export default MBinIcon; diff --git a/frontend/src/components/Header/SelectHomeInstance.tsx b/frontend/src/components/Header/SelectHomeInstance.tsx index 55d9e76..9055190 100644 --- a/frontend/src/components/Header/SelectHomeInstance.tsx +++ b/frontend/src/components/Header/SelectHomeInstance.tsx @@ -139,7 +139,7 @@ function SelectHomeInstance({ onSetKBin, homeBaseUrl, dispatch }) { 
isLoading: loadingKbin,
error: errorKbin,
data: dataKbin,
- } = useQueryCache("kbinMinData", "kbin.min");
+ } = useQueryCache("kbinMinData", "mbin.min");

const data = React.useMemo(() => {
if (loadingIns || loadingKbin) {
diff --git a/frontend/src/components/ListView/KBin.tsx b/frontend/src/components/ListView/MBin.tsx
similarity index 93%
rename from frontend/src/components/ListView/KBin.tsx
rename to frontend/src/components/ListView/MBin.tsx
index 552e974..6a02ed0 100644
--- a/frontend/src/components/ListView/KBin.tsx
+++ b/frontend/src/components/ListView/MBin.tsx
@@ -15,11 +15,13 @@ import VirtualTable from "./VirtualTable";
import { TinyNumber } from "../Shared/Display";
import { CopyLink, ExtCommunityLink } from "../Shared/Link";

-type IKBinListProps = {
- items: any[];
+import { IMBinMagazineOutput } from "../../../../types/output";
+
+type IMBinListProps = {
+ items: IMBinMagazineOutput[];
};

-const KBinList = React.memo(function ({ items }: IKBinListProps) {
+const MBinList = React.memo(function ({ items }: IMBinListProps) {
return (
{({ width }) => [
@@ -114,4 +116,4 @@ const KBinList = React.memo(function ({ items }: IKBinListProps) {
);
});

-export default KBinList;
+export default MBinList;
diff --git a/frontend/src/pages/KBinMagazines.tsx b/frontend/src/pages/MBinMagazines.tsx
similarity index 86%
rename from frontend/src/pages/KBinMagazines.tsx
rename to frontend/src/pages/MBinMagazines.tsx
index cda9aa8..a05662e 100644
--- a/frontend/src/pages/KBinMagazines.tsx
+++ b/frontend/src/pages/MBinMagazines.tsx
@@ -25,20 +25,28 @@ import ViewListIcon from "@mui/icons-material/ViewList";
import { LinearValueLoader, PageError, SimpleNumberFormat } from "../components/Shared/Display";
import TriStateCheckbox from "../components/Shared/TriStateCheckbox";

-import KBinGrid from "../components/GridView/KBin";
-import KBinList from "../components/ListView/KBin";
+import MBinGrid from "../components/GridView/MBin";
+import MBinList from "../components/ListView/MBin";

-function KBinMagazines() {
+import { IMBinMagazineOutput } from "../../../types/output";
+
+function MBinMagazines() {
const [searchParams, setSearchParams] = useSearchParams();

- const { isLoading, loadingPercent, isSuccess, isError, error, data } = useCachedMultipart(
- "magazinesData",
- "magazines",
- );
+ const {
+ isLoading,
+ loadingPercent,
+ isSuccess,
+ isError,
+ error,
+ data: rawData,
+ } = useCachedMultipart("magazinesData", "magazines");
+
+ const magData: IMBinMagazineOutput[] = rawData;

- const [viewType, setViewType] = useStorage("kbin.viewType", "grid");
+ const [viewType, setViewType] = useStorage("mbin.viewType", "grid");

- const [orderBy, setOrderBy] = React.useState("followers");
+ const [orderBy, setOrderBy] = React.useState("subscribers");

const [showNSFW, setShowNSFW] = React.useState(false);

// debounce the filter text input
@@ -69,34 +77,34 @@ function KBinMagazines() {
// this applies the filtering and sorting to the data loaded from .json
const magazinesData = React.useMemo(() => {
if (isError) return [];
- if (!data) return [];
+ if (!magData) return [];

console.time("sort+filter magazines");
- console.log(`Loaded ${data.length} magazines`);
+ console.log(`Loaded ${magData.length} magazines`);

- let communties = [...data];
+ let communties = [...magData];

console.log(`Sorting magazines by ${orderBy}`);

// Variable "ShowNSFW" is used to drive this
// Default: Hide NSFW false
if (showNSFW == false) {
- console.log(`Hiding NSFW magazines`);
+ console.log(`Hiding isAdult magazines`);
communties
= communties.filter((community) => { - return !community.sensitive; + return !community.isAdult; }); } // One Click: Include NSFW null else if (showNSFW == null) { - console.log(`Including NSFW magazines`); + console.log(`Including isAdult magazines`); } // Two Clicks: NSFW Only true else if (showNSFW == true) { console.log(`Showing NSFW magazines`); communties = communties.filter((community) => { - return community.sensitive; + return community.isAdult; }); } @@ -130,7 +138,7 @@ function KBinMagazines() { (community.name && community.name.toLowerCase().includes(term)) || (community.title && community.title.toLowerCase().includes(term)) || (community.baseurl && community.baseurl.toLowerCase().includes(term)) || - (community.summary && community.summary.toLowerCase().includes(term)) + (community.description && community.description.toLowerCase().includes(term)) ); }); }); @@ -145,7 +153,7 @@ function KBinMagazines() { (community.name && community.name.toLowerCase().includes(term)) || (community.title && community.title.toLowerCase().includes(term)) || (community.baseurl && community.baseurl.toLowerCase().includes(term)) || - (community.summary && community.summary.toLowerCase().includes(term)) + (community.description && community.description.toLowerCase().includes(term)) ); }); }); @@ -155,16 +163,16 @@ function KBinMagazines() { // sorting if (orderBy === "followers") { - communties = communties.sort((a, b) => b.followers - a.followers); + communties = communties.sort((a, b) => b.subscriptions - a.subscriptions); } else if (orderBy === "name") { - communties = communties.sort((a, b) => b.name - a.name); + communties = communties.sort((a, b) => a.name.localeCompare(b.name)); } console.log(`Sorted ${communties.length} magazines`); console.log( `updating magazines data with ${communties.length} magazines, removed: ${ - data.length - communties.length + magData.length - communties.length }`, ); @@ -172,7 +180,7 @@ function KBinMagazines() { // return a clone so that it triggers a re-render on sort return [...communties]; - }, [data]); + }, [magData]); return ( } {isError && } - {isSuccess && viewType == "grid" && } - {isSuccess && viewType == "list" && } + {isSuccess && viewType == "grid" && } + {isSuccess && viewType == "list" && } ); } -export default React.memo(KBinMagazines); +export default React.memo(MBinMagazines); diff --git a/pages/src/App.tsx b/pages/src/App.tsx index 435ab74..e67e20e 100644 --- a/pages/src/App.tsx +++ b/pages/src/App.tsx @@ -99,11 +99,11 @@ export default function App() { count: "communities", }, { - name: "KBin Instances", + name: "MBin Instances", chip: "Min", - desc: "an array of known kbin instances", - path: "kbin.min.json", - count: "kbin_instances", + desc: "an array of known mbin instances", + path: "mbin.min.json", + count: "mbin_instances", }, { name: "KBin Magazines", diff --git a/types/output.ts b/types/output.ts new file mode 100644 index 0000000..04b8dd5 --- /dev/null +++ b/types/output.ts @@ -0,0 +1,110 @@ +import { IUptimeNodeData } from "./storage"; + +export type IMetaDataOutput = { + instances: number; + communities: number; + mbin_instances: number; // @ NEW + magazines: number; + fediverse: number; + + time: number; + package: string; + version: string; + + linked?: any; + allowed?: any; + blocked?: any; +}; + +export type IInstanceDataOutput = { + baseurl: string; + url: string; + name: string; + desc: string; + downvotes: boolean; + nsfw: boolean; + create_admin: boolean; + private: boolean; + fed: boolean; + version: string; + open: boolean; 
+  usage: number;
+  counts: Object;
+  icon: string;
+  banner: string;
+  langs: string[];
+  date: string;
+  published: number;
+  time: number;
+  score: number;
+  uptime?: IUptimeNodeData;
+  isSuspicious: boolean;
+  metrics: Object | null;
+  tags: string[];
+  susReason: string[];
+  trust: [];
+  blocks: {
+    incoming: number;
+    outgoing: number;
+  };
+  blocked: string[];
+};
+
+export type ICommunityDataOutput = {
+  baseurl: string;
+  url: string;
+  name: string;
+  title: string;
+  desc: string;
+  icon: string | null;
+  banner: string | null;
+  nsfw: boolean;
+  counts: Object;
+  published: number;
+  time: number;
+  isSuspicious: boolean;
+  score: number;
+};
+
+export type IMBinInstanceOutput = {
+  // actor_id: string;
+  // title: string;
+  // name: string;
+  // preferred: string;
+  // baseurl: string;
+  // summary: string;
+  // sensitive: boolean;
+  // postingRestrictedToMods: boolean;
+  // icon: string;
+  // published: string;
+  // updated: string;
+  // followers: number;
+  // time: number;
+};
+
+export type IMBinMagazineOutput = {
+  baseurl: string;
+  magazineId: number;
+  title: string;
+  name: string;
+  description: string;
+  isAdult: boolean;
+  postingRestrictedToMods: boolean;
+  icon: string | null;
+  subscriptions: number;
+  posts: number;
+  time: number;
+};
+
+export type IFediverseDataOutput = {
+  url: string;
+  software: string;
+  version: string;
+};
+
+export type IClassifiedErrorOutput = {
+  baseurl: string;
+  time: number;
+  error: string;
+  type?: string;
+};
diff --git a/types/storage.ts b/types/storage.ts
new file mode 100644
index 0000000..3b24236
--- /dev/null
+++ b/types/storage.ts
@@ -0,0 +1,202 @@
+/// COMMUNITY
+
+export type ICommunityData = {
+  community: {
+    id: number;
+    name: string;
+    title: string;
+    description: string;
+    removed: boolean;
+    published: string;
+    updated: string | null;
+    deleted: boolean;
+    nsfw: boolean;
+    actor_id: string;
+    local: boolean;
+    icon: string | null;
+    banner: string | null;
+    hidden: boolean;
+    posting_restricted_to_mods: boolean;
+    instance_id: number;
+  };
+  subscribed: string;
+  blocked: boolean;
+  counts: Object;
+  banned_from_community?: boolean;
+  lastCrawled: number;
+};
+
+export type ICommunityDataKeyValue = {
+  [key: string]: ICommunityData;
+};
+
+// FEDISEER
+
+export type IFediseerInstanceFlags = {
+  flag: "RESTRICTED" | "MUTED";
+  comment: string;
+};
+
+export type IFediseerTag = {
+  tag: string;
+  count?: number;
+  rank?: number;
+};
+
+export type IFediseerInstanceData = {
+  id: number;
+  domain: string;
+  software: string;
+  version: string;
+  claimed: number;
+  open_registrations: boolean;
+  email_verify: boolean;
+  approval_required: boolean;
+  has_captcha: boolean;
+  approvals: number;
+  endorsements: number;
+  guarantor: string;
+  censure_reasons: string[] | null;
+  sysadmins: number;
+  moderators: number;
+
+  state: "UP" | "UNREACHABLE" | "OFFLINE" | "DECOMMISSIONED";
+
+  tags: IFediseerTag[] | string[];
+
+  visibility_endorsements: "OPEN" | "ENDORSED" | "PRIVATE";
+  visibility_censures: "OPEN" | "ENDORSED" | "PRIVATE";
+  visibility_hesitations: "OPEN" | "ENDORSED" | "PRIVATE";
+
+  flags: IFediseerInstanceFlags[];
+};
+
+/// FEDIVERSE
+
+export type IFediverseData = {
+  time?: number;
+  baseurl?: string;
+  name?: string;
+  version?: string;
+  repository?: string;
+  homepage?: string;
+};
+
+export type IFediverseDataKeyValue = {
+  [key: string]: IFediverseData;
+};
+
+/// INSTANCE
+
+export type IInstanceData = {
+  nodeData: any;
+  siteData: any;
+  headers: any;
+  langs: Array<string>;
+  lastCrawled: number;
+};
+ +export type IInstanceDataKeyValue = { + [key: string]: IInstanceData; +}; + +///// MBIN + +export type IMagazineData = { + magazineId: number; + owner: { + magazineId: number; + userId: number; + avatar: any; + username: string; + apId: any; + }; + icon: { + storageUrl: string; + [key: string]: any; + } | null; + name: string; + title: string; + description: string; + rules: string; + subscriptionsCount: number; + entryCount: number; + entryCommentCount: number; + postCount: number; + postCommentCount: number; + isAdult: boolean; + isUserSubscribed: any; + isBlockedByUser: any; + tags: string[]; + badges: { + badgeId: number; + magazineId: number; + name: string; + }[]; + moderators: { + magazineId: number; + userId: number; + avatar: { + storageUrl: string; + [key: string]: any; + }; + username: string; + apId: any; + }[]; + apId: any; + apProfileId: string; + serverSoftware: any; + serverSoftwareVersion: any; + isPostingRestrictedToMods: boolean; + lastCrawled?: number; + baseurl: string; +}; +export type IMagazineDataKeyValue = { + [key: string]: IMagazineData; +}; + +//// TRACKING + +export type IErrorData = { + time: number; + error: string; + stack?: string; + isAxiosError?: boolean; + requestUrl?: string; + code?: string; + url?: string; + duration?: number; +}; + +export type IErrorDataKeyValue = { + [key: string]: IErrorData; +}; + +export type ILastCrawlData = { + time: number; + duration?: number; + [key: string]: any; +}; + +export type ILastCrawlDataKeyValue = { + [key: string]: ILastCrawlData; +}; + +//// UPTIME + +export type IUptimeNodeData = { + domain: string; + latency: number; + countryname: string; + uptime_alltime: string; + date_created: string; + date_updated: string; + date_laststats: string; + score: number; + status: number; +}; + +export type IFullUptimeData = { + timestamp: number; + nodes: IUptimeNodeData[]; +}; From 086404a6ae647d3a2bacda9f7ab6bdc4319dfe0a Mon Sep 17 00:00:00 2001 From: tgxn Date: Wed, 8 Jan 2025 21:09:52 +0800 Subject: [PATCH 12/15] update to render mbin magazines --- frontend/index.html | 1 + frontend/public/icons/MBin_Logo.svg | 78 +++++++++++++++++++ frontend/src/components/Header/KBinIcon.tsx | 49 ------------ frontend/src/components/Header/MBinIcon.tsx | 85 +++------------------ frontend/src/components/ListView/MBin.tsx | 24 ++++-- frontend/src/pages/MBinMagazines.tsx | 7 +- 6 files changed, 113 insertions(+), 131 deletions(-) create mode 100644 frontend/public/icons/MBin_Logo.svg delete mode 100644 frontend/src/components/Header/KBinIcon.tsx diff --git a/frontend/index.html b/frontend/index.html index 4464c0a..a2be688 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -3,6 +3,7 @@ + diff --git a/frontend/public/icons/MBin_Logo.svg b/frontend/public/icons/MBin_Logo.svg new file mode 100644 index 0000000..cf84a8e --- /dev/null +++ b/frontend/public/icons/MBin_Logo.svg @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/frontend/src/components/Header/KBinIcon.tsx b/frontend/src/components/Header/KBinIcon.tsx deleted file mode 100644 index d16a3a1..0000000 --- a/frontend/src/components/Header/KBinIcon.tsx +++ /dev/null @@ -1,49 +0,0 @@ -import * as React from "react"; -const KBinIcon = (props) => ( - - - - - - - - - - - - - - -); -export default KBinIcon; diff --git a/frontend/src/components/Header/MBinIcon.tsx b/frontend/src/components/Header/MBinIcon.tsx index 1c2824a..60ed224 100644 --- a/frontend/src/components/Header/MBinIcon.tsx +++ 
b/frontend/src/components/Header/MBinIcon.tsx @@ -1,77 +1,16 @@ import * as React from "react"; + +import Box from "@mui/joy/Box"; + const MBinIcon = (props) => ( - - - - {/* - - - */} - - - - - - - - - - - - - - - + ); export default MBinIcon; diff --git a/frontend/src/components/ListView/MBin.tsx b/frontend/src/components/ListView/MBin.tsx index 6a02ed0..21d88d2 100644 --- a/frontend/src/components/ListView/MBin.tsx +++ b/frontend/src/components/ListView/MBin.tsx @@ -35,7 +35,7 @@ const MBinList = React.memo(function ({ items }: IMBinListProps) { headerStyle={{ justifyContent: "left", }} - cellRenderer={({ rowData }) => { + cellRenderer={({ rowData }: { rowData: IMBinMagazineOutput }) => { // console.log(rowData); return ( @@ -78,15 +78,15 @@ const MBinList = React.memo(function ({ items }: IMBinListProps) { baseType="kbin" community={{ baseurl: rowData.baseurl, // for link - name: rowData.preferred, // for link - title: rowData.name, // for display + name: rowData.name, // for link + title: rowData.title, // for display }} /> , } + cellRenderer={({ rowData }) => } + />, + } />, ]} diff --git a/frontend/src/pages/MBinMagazines.tsx b/frontend/src/pages/MBinMagazines.tsx index a05662e..cb576a4 100644 --- a/frontend/src/pages/MBinMagazines.tsx +++ b/frontend/src/pages/MBinMagazines.tsx @@ -162,8 +162,10 @@ function MBinMagazines() { console.log(`Filtered ${communties.length} magazines`); // sorting - if (orderBy === "followers") { + if (orderBy === "subscriptions") { communties = communties.sort((a, b) => b.subscriptions - a.subscriptions); + } else if (orderBy === "posts") { + communties = communties.sort((a, b) => b.posts - a.posts); } else if (orderBy === "name") { communties = communties.sort((a, b) => a.name.localeCompare(b.name)); } @@ -234,7 +236,8 @@ function MBinMagazines() { }} > {/* */} - + + From f8697a9b4e48b5e7645edfd11038c7dbbab3a3f8 Mon Sep 17 00:00:00 2001 From: tgxn Date: Wed, 8 Jan 2025 21:13:30 +0800 Subject: [PATCH 13/15] ok, ui mbin complete --- frontend/src/pages/MBinMagazines.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/pages/MBinMagazines.tsx b/frontend/src/pages/MBinMagazines.tsx index cb576a4..1977733 100644 --- a/frontend/src/pages/MBinMagazines.tsx +++ b/frontend/src/pages/MBinMagazines.tsx @@ -46,7 +46,7 @@ function MBinMagazines() { const [viewType, setViewType] = useStorage("mbin.viewType", "grid"); - const [orderBy, setOrderBy] = React.useState("subscribers"); + const [orderBy, setOrderBy] = React.useState("subscriptions"); const [showNSFW, setShowNSFW] = React.useState(false); // debounce the filter text input From 74b36e437b30f1b4c6397c84d6a8269417f7ad6c Mon Sep 17 00:00:00 2001 From: tgxn Date: Wed, 8 Jan 2025 21:58:52 +0800 Subject: [PATCH 14/15] remove kbin references --- README.md | 31 ++++++++--------- crawler/README.md | 33 ++++++++++--------- crawler/src/crawl/mbin.ts | 2 +- frontend/src/components/GridView/MBin.tsx | 10 +++--- frontend/src/components/GridView/MBinCard.tsx | 2 +- frontend/src/components/Header/Header.tsx | 4 +-- .../src/components/Header/HeaderSideMenu.tsx | 1 - .../components/Header/HomeInstanceButton.tsx | 26 +++++++-------- .../components/Header/SelectHomeInstance.tsx | 26 +++++++-------- frontend/src/components/ListView/MBin.tsx | 2 +- frontend/src/components/Shared/Link.tsx | 4 +-- pages/src/App.tsx | 4 +-- 12 files changed, 73 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index a298529..485709f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ 
[![publish-pages](https://github.com/tgxn/lemmy-explorer/actions/workflows/publish-pages.yaml/badge.svg)](https://github.com/tgxn/lemmy-explorer/actions/workflows/publish-pages.yaml)

# Lemmy Explorer https://lemmyverse.net/
+
Data Dumps: https://data.lemmyverse.net/

This project provides a simple way to explore Lemmy Instances and Communities.
@@ -8,6 +9,7 @@ This project provides a simple way to explore Lemmy Instances and Communities.
![List of Communities](./docs/images/communities.png)

The project consists of four modules:
+
1. Crawler (NodeJS, Redis) `/crawler`
2. Frontend (ReactJS, MUI Joy, TanStack) `/frontend`
3. Deploy (Amazon CDK v2) `/cdk`
@@ -20,16 +22,18 @@ The project consists of four modules:
You can append `home_url` and (optionally) `home_type` to the URL to set the home instance and type.

`?home_url=lemmy.example.com`
-`?home_url=kbin.example.com&home_type=kbin`
+`?home_url=mbin.example.com&home_type=mbin`

- > `home_type` supports "lemmy" and "kbin" (default is "lemmy")
+> `home_type` supports "lemmy" and "mbin" (default is "lemmy")

### Q: **How does discovery work?**
+
It uses a [seed list of communities](https://github.com/tgxn/lemmy-explorer/blob/main/crawler/src/lib/const.js#L47) and scans the equivalent of the `/instances` federation lists, and then creates jobs to scan each of those servers.

Additionally, instance tags and trust data are fetched from [Fediseer](https://gui.fediseer.com/).

### Q: **How does the NSFW filter work?**
+
The NSFW filter is a client-side filter that filters out NSFW communities and instances from results by default.
The "NSFW Toggle" checkbox has three states that you can toggle through:

| State | Filter | Value |
| ---------- | ------------ | ----- |
| Default | Hide NSFW | false |
| One Click | Include NSFW | null |
| Two Clicks | NSFW Only | true |

-When you try to switch to a non-SFW state, a popup will appear to confirm your choice. You can save your response in your browser's cache and it will be remembered.
-
+When you try to switch to a non-SFW state, a popup will appear to confirm your choice. You can save your response in your browser's cache and it will be remembered.

### Q: **How long till my instance shows up?**
+
How long it takes to discover a new instance can vary depending on whether you post content that's picked up by one of these servers.

Since the crawler looks at lists of federated instances, we can't discover instances that aren't on those lists.
@@ -49,6 +53,7 @@ Since the crawler looks at lists of federated instances, we can't discover insta
Additionally, the lists are cached for 24 hours, so it can take up to 24 hours after discovery for an instance to show up.

### Q: **Can I use your data in my app/website/project?**
+
I do not own any of the data retrieved by the crawler; it is available from public endpoints on the source instances.
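For example, a minimal sketch of consuming the published dumps from TypeScript (an illustration, not part of this repo; the URL path is an assumption, and only the file names listed further below come from the project):

```typescript
// Sketch only: pull the published overview JSON and read the counts.
// The host comes from the Data Dumps link above; the exact path to
// overview.json on the pages site is a hypothetical example.
type OverviewCounts = { instances: number; communities: number; magazines: number };

async function fetchOverview(): Promise<OverviewCounts> {
  const res = await fetch("https://data.lemmyverse.net/overview.json"); // hypothetical path
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return (await res.json()) as OverviewCounts;
}
```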
You are free to pull data from the GitHub pages site: @@ -66,21 +71,21 @@ You can also download [Latest ZIP](https://nightly.link/tgxn/lemmy-explorer/work `dist-json-bundle.zip` file contains the data in JSON format: - - `communities.full.json` - list of all communities - - `instances.full.json` - list of all instances - - `overview.json` - metadata and counts - +- `communities.full.json` - list of all communities +- `instances.full.json` - list of all instances +- `overview.json` - metadata and counts ## Crawler + [Crawler README](./crawler/README.md) ## Frontend + [Frontend README](./frontend/README.md) ## Data Site -[Data Site README](./pages/README.md) - +[Data Site README](./pages/README.md) ## Deploy @@ -90,9 +95,6 @@ The deploy is an Amazon CDK v2 project that deploys the crawler and frontend to then run `cdk deploy --all` to deploy the frontend to AWS. - - - ## Similar Sites - https://browse.feddit.de/ @@ -102,8 +104,8 @@ then run `cdk deploy --all` to deploy the frontend to AWS. - https://browse.toast.ooo/ - https://lemmyfind.quex.cc/ - ## Lemmy Stats Pages + - https://lemmy.fediverse.observer/dailystats - https://the-federation.info/platform/73 - https://fedidb.org/software/lemmy @@ -117,4 +119,3 @@ then run `cdk deploy --all` to deploy the frontend to AWS. # Credits Logo made by Andy Cuccaro (@andycuccaro) under the CC-BY-SA 4.0 license. - diff --git a/crawler/README.md b/crawler/README.md index 403fb0b..45863d9 100644 --- a/crawler/README.md +++ b/crawler/README.md @@ -48,7 +48,7 @@ These immediately run a specific task. | `--init` | Initialize queue with seed jobs | | `--health` | Check worker health | | `--aged` | Create jobs for aged instances and communities | -| `--kbin` | Create jobs for kbin communities | +| `--mbin` | Create jobs for mbin communities | | `--uptime` | Immediately crawl uptime data | | `--fedi` | Immediately crawl Fediseer data | @@ -73,7 +73,7 @@ These start a worker that will run continuously, processing jobs from the releva | `-w instance` | Crawl instances from the queue | | `-w community` | Crawl communities from the queue | | `-w single` | Crawl single communities from the queue | -| `-w kbin` | Crawl kbin communities from the queue | +| `-w mbin` | Crawl mbin communities from the queue | | `-w cron` | Schedule all CRON jobs for aged instances and communities, etc | #### **Examples** @@ -94,7 +94,7 @@ These start a worker that will run a single job, then exit. | `-m [i\|instance] ` | Crawl a single instance | | `-m [c\|community] ` | Crawl a single instance's community list | | `-m [s\|single] ` | Crawl a single community, delete if not exists | -| `-m [k\|kbin] ` | Crawl a single community | +| `-m [m\|mbin] ` | Crawl a single mbin instance | #### **Examples** @@ -126,7 +126,7 @@ Crawlers are tasks created to perform an action, which could be crawling an inst | `community` | Community Crawling | | `fediseer` | Fediseer Crawling | | `uptime` | Uptime Crawling | -| `kbin` | Kbin Crawling | +| `mbin` | MBin Crawling | ### Queues @@ -137,7 +137,7 @@ Queues are where Tasks can be placed to be processed. | `instance` | Crawl an instance | | `community_list` | Crawl a community | | `community_single` | Crawl a single community | -| `kbin` | Crawl a kbin community | +| `mbin` | Crawl a mbin instance | ## Storage @@ -146,16 +146,17 @@ Redis is used to store crawled data. You can use `docker compose up -d` to start a local redis server. Data is persisted to a `.data/redis` directory. 
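As an illustrative sketch only (this is not the crawler's own storage layer), the key prefixes listed in the table below can be inspected with any Redis client, for example ioredis:

```typescript
// Sketch: list magazine keys in the local docker-compose Redis.
// The key patterns come from the table below; the ioredis client and
// the example key shape are assumptions for illustration.
import Redis from "ioredis";

async function listMagazineKeys(): Promise<string[]> {
  const redis = new Redis(); // defaults to localhost:6379
  // magazine keys are sub-keyed per instance, e.g. "magazine:example.com:music"
  const keys = await redis.keys("magazine:*");
  await redis.quit();
  return keys;
}
```

Note that `KEYS` walks the whole keyspace, so prefer `SCAN` against a large dataset; for a local debugging session it is fine.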
-| Redis Key | Description | -| -------------- | ----------------------------------------------------- | -| `attributes:*` | Tracked attribute sets _(change over time)_ | -| `community:*` | Community details | -| `deleted:*` | Deleted data _(recycle bin if something broken)_ | -| `error:*` | Exception details | -| `fediverse:*` | Fediverse data | -| `instance:*` | Instance details | -| `last_crawl:*` | Last crawl time for instances and communities | -| `magazine:*` | Magazine data _(kbin magazines)_ | -| `uptime:*` | Uptime data _(fetched from `api.fediverse.observer`)_ | +| Redis Key | Description | +| ----------------- | ----------------------------------------------------- | +| `attributes:*` | Tracked attribute sets _(change over time)_ | +| `community:*` | Community details | +| `deleted:*` | Deleted data _(recycle bin if something broken)_ | +| `error:*` | Exception details | +| `fediverse:*` | Fediverse data | +| `instance:*` | Instance details | +| `last_crawl:*` | Last crawl time for instances and communities | +| `mbin_instance:*` | MBin Instances | +| `magazine:*` | Magazine data _(mbin magazines)_ | +| `uptime:*` | Uptime data _(fetched from `api.fediverse.observer`)_ | Most of the keys have sub keys for the instance `base_url` or community `base_url:community_name`. diff --git a/crawler/src/crawl/mbin.ts b/crawler/src/crawl/mbin.ts index f6ff546..c6f4cf5 100644 --- a/crawler/src/crawl/mbin.ts +++ b/crawler/src/crawl/mbin.ts @@ -347,7 +347,7 @@ export const mbinInstanceProcessor: IJobProcessor = async ({ baseUrl }) => { const startTime = Date.now(); try { - // check for recent scan of this kbin instance + // check for recent scan of this mbin instance const lastCrawl = await storage.tracking.getLastCrawl("mbin", baseUrl); if (lastCrawl) { const lastCrawledMsAgo = Date.now() - lastCrawl.time; diff --git a/frontend/src/components/GridView/MBin.tsx b/frontend/src/components/GridView/MBin.tsx index b1fa007..f57ca87 100644 --- a/frontend/src/components/GridView/MBin.tsx +++ b/frontend/src/components/GridView/MBin.tsx @@ -4,13 +4,13 @@ import { useMasonry, usePositioner, useContainerPosition, useScroller } from "ma import { useWindowSize } from "@react-hook/window-size"; -import KBinCard from "./MBinCard"; +import NMBinCard from "./MBinCard"; -type KBinGridProps = { +type MBinGridProps = { items: any[]; }; -const KBinGrid = React.memo(function ({ items }: KBinGridProps) { +const MBinGrid = React.memo(function ({ items }: MBinGridProps) { const containerRef = React.useRef(null); const [windowWidth, height] = useWindowSize(); @@ -20,7 +20,7 @@ const KBinGrid = React.memo(function ({ items }: KBinGridProps) { const { scrollTop, isScrolling } = useScroller(offset); - const CardAsCallback = React.useCallback((props) => , [isScrolling]); + const CardAsCallback = React.useCallback((props) => , [isScrolling]); return useMasonry({ containerRef, @@ -33,4 +33,4 @@ const KBinGrid = React.memo(function ({ items }: KBinGridProps) { render: CardAsCallback, }); }); -export default KBinGrid; +export default MBinGrid; diff --git a/frontend/src/components/GridView/MBinCard.tsx b/frontend/src/components/GridView/MBinCard.tsx index 13dad67..3ddd5d1 100644 --- a/frontend/src/components/GridView/MBinCard.tsx +++ b/frontend/src/components/GridView/MBinCard.tsx @@ -81,7 +81,7 @@ const MBinCard = React.memo(function ({ magazine }: MBinCardProps) { }} > (instanceType === "mbin"); - const setIsKbinInstance = (isKbin) => { - _setIsKbinInstance(isKbin); - dispatch(changeInstanceType(isKbin ? 
"kbin" : "lemmy")); + const setIsMBinInstance = (isMBin: boolean) => { + _setIsMBinInstance(isMBin); + dispatch(changeInstanceType(isMBin ? "mbin" : "lemmy")); }; const [anchorEl, setAnchorEl] = React.useState(null); const [menuOpen, setMenuOpen] = React.useState(false); useEffect(() => { - setIsKbinInstance(instanceType === "kbin"); + setIsMBinInstance(instanceType === "mbin"); }, [instanceType]); const handleClick = (event) => { @@ -97,7 +97,7 @@ function HomeInstanceButton({ homeBaseUrl, instanceType, dispatch }) { color: "text.secondary", }} > - setIsKbinInstance(isKbin)} /> + setIsMBinInstance(isMBin)} /> {homeBaseUrl && ( @@ -111,17 +111,17 @@ function HomeInstanceButton({ homeBaseUrl, instanceType, dispatch }) { > setIsKbinInstance(event.target.checked)} + color={isMBinInstance ? "warning" : "success"} + checked={isMBinInstance} + onChange={(event) => setIsMBinInstance(event.target.checked)} // labelPlacement="end" /> - {!isKbinInstance && Lemmy Instance} - {isKbinInstance && KBin Instance} - {!isKbinInstance && Instance links use /c/} - {isKbinInstance && Instance links use /m/} + {!isMBinInstance && Lemmy Instance} + {isMBinInstance && MBin Instance} + {!isMBinInstance && Instance links use /c/} + {isMBinInstance && Instance links use /m/} )} diff --git a/frontend/src/components/Header/SelectHomeInstance.tsx b/frontend/src/components/Header/SelectHomeInstance.tsx index 9055190..af848c5 100644 --- a/frontend/src/components/Header/SelectHomeInstance.tsx +++ b/frontend/src/components/Header/SelectHomeInstance.tsx @@ -128,7 +128,7 @@ const ListboxComponent = React.forwardRef(function ListboxComponent(props: IList ); }); -function SelectHomeInstance({ onSetKBin, homeBaseUrl, dispatch }) { +function SelectHomeInstance({ onSetMBin, homeBaseUrl, dispatch }) { const { isLoading: loadingIns, error: errorIns, @@ -136,17 +136,17 @@ function SelectHomeInstance({ onSetKBin, homeBaseUrl, dispatch }) { } = useQueryCache("instanceMinData", "instance.min"); const { - isLoading: loadingKbin, - error: errorKbin, - data: dataKbin, - } = useQueryCache("kbinMinData", "mbin.min"); + isLoading: loadingMBin, + error: errorMBin, + data: dataMBin, + } = useQueryCache("mbinMinData", "mbin.min"); const data = React.useMemo(() => { - if (loadingIns || loadingKbin) { + if (loadingIns || loadingMBin) { return null; } - if (errorIns || errorKbin) { + if (errorIns || errorMBin) { return null; } @@ -154,25 +154,25 @@ function SelectHomeInstance({ onSetKBin, homeBaseUrl, dispatch }) { data = data.concat(dataIns.map((item) => ({ ...item, type: "lemmy" }))); - for (const item of dataKbin) { + for (const item of dataMBin) { data.push({ base: item, name: item, - type: "kbin", + type: "mbin", }); } return data; - }, [dataIns, dataKbin]); + }, [dataIns, dataMBin]); const onChange = (newValue) => { console.log("onChange", newValue); - if (newValue?.type === "kbin") { - onSetKBin(true); + if (newValue?.type === "mbin") { + onSetMBin(true); // return; } else if (newValue?.type === "lemmy") { - onSetKBin(false); + onSetMBin(false); } if (newValue == null) { diff --git a/frontend/src/components/ListView/MBin.tsx b/frontend/src/components/ListView/MBin.tsx index 21d88d2..d4fbb3c 100644 --- a/frontend/src/components/ListView/MBin.tsx +++ b/frontend/src/components/ListView/MBin.tsx @@ -75,7 +75,7 @@ const MBinList = React.memo(function ({ items }: IMBinListProps) { }} > Date: Thu, 9 Jan 2025 00:46:23 +0800 Subject: [PATCH 15/15] add check --- crawler/src/crawl/mbin.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/crawler/src/crawl/mbin.ts b/crawler/src/crawl/mbin.ts index c6f4cf5..4201311 100644 --- a/crawler/src/crawl/mbin.ts +++ b/crawler/src/crawl/mbin.ts @@ -312,7 +312,7 @@ export default class CrawlMBin { baseurl: crawlDomain, ...magazineData, - icon: magazineData.icon.storageUrl ? magazineData.icon.storageUrl : null, + icon: magazineData?.icon?.storageUrl ? magazineData.icon.storageUrl : null, lastCrawled: Date.now(), };
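The `magazineData?.icon?.storageUrl` check above is the entirety of the final patch: a magazine's `icon` can be `null` (or lack a `storageUrl`), and the unguarded property access would throw for any magazine without an icon. As a standalone sketch of the pattern (the helper name `iconUrl` is illustrative; the icon shape follows `IMagazineData` from `types/storage.ts`):

```typescript
// Sketch of the null-safe icon access introduced in PATCH 15: optional
// chaining with a null fallback, so icon-less magazines no longer throw.
type MagazineIcon = { storageUrl: string; [key: string]: any } | null;

function iconUrl(magazine: { icon?: MagazineIcon }): string | null {
  return magazine.icon?.storageUrl ? magazine.icon.storageUrl : null;
}
```

The output writer applies the same guard (`mbin.icon?.storageUrl`) before publishing magazine records, so both the crawl and the export tolerate magazines without icons.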