diff --git a/crawler/src/lib/const.ts b/crawler/src/lib/const.ts index 7c7fa1b..472abf2 100644 --- a/crawler/src/lib/const.ts +++ b/crawler/src/lib/const.ts @@ -38,6 +38,8 @@ export const CRAWL_AGED_TIME = { // if a server is identified as a non-lemmy server, ho often should we wait before checking again? FEDIVERSE: hoursToMs(2 * 24), // 2 days + + MAGAZINE: hoursToMs(8), }; // consider for deletion after they haven't been seen for this long diff --git a/crawler/src/lib/crawlStorage.ts b/crawler/src/lib/crawlStorage.ts index 776c6bc..5be2069 100644 --- a/crawler/src/lib/crawlStorage.ts +++ b/crawler/src/lib/crawlStorage.ts @@ -6,7 +6,6 @@ import logging from "./logging"; // core import InstanceStore from "./storage/instance"; import CommunityStore from "./storage/community"; -import KBinStore from "./storage/kbin"; import MBinStore from "./storage/mbin"; // supporting @@ -29,7 +28,6 @@ export class CrawlStorage { public fediverse: FediverseStore; public fediseer: FediseerStore; public tracking: TrackingStore; - public kbin: KBinStore; public mbin: MBinStore; constructor() { @@ -46,7 +44,6 @@ export class CrawlStorage { this.fediverse = new FediverseStore(this); this.fediseer = new FediseerStore(this); this.tracking = new TrackingStore(this); - this.kbin = new KBinStore(this); this.mbin = new MBinStore(this); } diff --git a/crawler/src/lib/storage/kbin.ts b/crawler/src/lib/storage/kbin.ts deleted file mode 100644 index 6f4cc2b..0000000 --- a/crawler/src/lib/storage/kbin.ts +++ /dev/null @@ -1,66 +0,0 @@ -import { CrawlStorage } from "../crawlStorage"; - -export type IMagazineData = { - baseUrl: string; - name: string; - description: string; - lastCrawled: number; - [key: string]: any; -}; - -export type IMagazineDataKeyValue = { - [key: string]: IMagazineData; -}; - -export default class KBinStore { - private storage: CrawlStorage; - - constructor(storage: CrawlStorage) { - this.storage = storage; - } - - async getAll(): Promise { - return 
this.storage.listRedis(`magazine:*`); - } - - async getAllWithKeys(): Promise { - return this.storage.listRedisWithKeys(`magazine:*`); - } - - async getOne(baseUrl: string, magazineName: string) { - return this.storage.getRedis(`magazine:${baseUrl}:${magazineName}`); - } - - async upsert(baseUrl: string, magazine: IMagazineData) { - const storeData = { - ...magazine, - lastCrawled: Date.now(), - }; - return this.storage.putRedis(`magazine:${baseUrl}:${magazine.name.toLowerCase()}`, storeData); - } - - async delete(baseUrl: string, magazineName: string, reason = "unknown") { - const oldRecord = await this.getOne(baseUrl, magazineName); - await this.storage.putRedis(`deleted:magazine:${baseUrl}:${magazineName}`, { - ...oldRecord, - deletedAt: Date.now(), - deleteReason: reason, - }); - - return this.storage.deleteRedis(`magazine:${baseUrl}:${magazineName}`); - } - - // use these to track magazine attributes over time - async setTrackedAttribute( - baseUrl: string, - magazineName: string, - attributeName: string, - attributeValue: string, - ) { - return await this.storage.redisZAdd( - `attributes:magazine:${baseUrl}:${magazineName}:${attributeName}`, - Date.now(), - attributeValue, - ); - } -} diff --git a/crawler/src/lib/storage/mbin.ts b/crawler/src/lib/storage/mbin.ts index 5a497d5..bd2bee4 100644 --- a/crawler/src/lib/storage/mbin.ts +++ b/crawler/src/lib/storage/mbin.ts @@ -37,6 +37,7 @@ export type IMagazineData = { serverSoftwareVersion: any; isPostingRestrictedToMods: boolean; lastCrawled?: number; + baseurl: string; }; export type IMagazineDataKeyValue = { [key: string]: IMagazineData; }; @@ -50,7 +51,14 @@ export default class MBinStore { } async getAll(): Promise { - return this.storage.listRedis(`mbin_magazine:*`); + const magazineKeyValue = await this.storage.listRedisWithKeys(`mbin_magazine:*`); + + // put baseUrl into the magazine object + for (const key in magazineKeyValue) { + magazineKeyValue[key].baseurl = key.split(":")[1]; + } + + return 
Object.values(magazineKeyValue); } async getAllWithKeys(): Promise { diff --git a/crawler/src/output/file_writer.ts b/crawler/src/output/file_writer.ts index bc24023..402c804 100644 --- a/crawler/src/output/file_writer.ts +++ b/crawler/src/output/file_writer.ts @@ -1,5 +1,7 @@ import path from "node:path"; -import { open, rm, mkdir } from "node:fs/promises"; +import { open, rm, mkdir, FileHandle } from "node:fs/promises"; + +import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime"; /** * OutputFileWriter - This class handles writing the output JSON files. @@ -10,6 +12,139 @@ import { open, rm, mkdir } from "node:fs/promises"; // love you all +export type IMetaDataOutput = { + instances: number; + communities: number; + mbin_instances: number; // @ NEW + magazines: number; + fediverse: number; + + time: number; + package: string; + version: string; + + linked?: any; + allowed?: any; + blocked?: any; +}; + +export type IInstanceDataOutput = { + baseurl: string; + url: string; + name: string; + desc: string; + downvotes: boolean; + nsfw: boolean; + create_admin: boolean; + private: boolean; + fed: boolean; + version: string; + open: boolean; + usage: number; + counts: Object; + icon: string; + banner: string; + langs: string[]; + date: string; + published: number; + time: number; + score: number; + uptime?: IUptimeNodeData; + isSuspicious: boolean; + metrics: Object | null; + tags: string[]; + susReason: string[]; + trust: []; + blocks: { + incoming: number; + outgoing: number; + }; + blocked: string[]; +}; + +export type ICommunityDataOutput = { + baseurl: string; + url: string; + name: string; + title: string; + desc: string; + icon: string | null; + banner: string | null; + nsfw: boolean; + counts: Object; + published: number; + time: number; + isSuspicious: boolean; + score: number; +}; + +export type IMBinInstanceOutput = { + // actor_id: string; + // title: string; + // name: string; + // preferred: string; + // baseurl: string; + // summary: 
string; + // sensitive: boolean; + // postingRestrictedToMods: boolean; + // icon: string; + // published: string; + // updated: string; + // followers: number; + // time: number; +}; + +export type IMBinMagazineOutput = { + baseUrl: string; + magazineId: number; + title: string; + name: string; + description: string; + isAdult: boolean; + postingRestrictedToMods: boolean; + icon: string; + subscriptions: number; + posts: number; + time: number; +}; + +export type IFediverseDataOutput = { + url: string; + software: string; + version: string; +}; + +export type IClassifiedErrorOutput = { + baseurl: string; + time: number; + error: string; + type?: string; +}; + +// type IInstanceOutput = {}; + +// // minified version, only enough for sort/filter +// // { +// // "base": "lemmy.ml", +// // "title": "Lemmy!", +// // "name": "lemmy", +// // "desc": "lemmy instance is cool and stuff!", +// // "sort": { +// // "score": 724, //smart sort +// // "subscribers": 1, +// // "users": "users_active_week", +// // "posts": 0, +// // "comments": 0, +// // } +// // } +// type IInstanceMinOutput = {}; +// type IInstanceMetaOutput = {}; + +// type ICommunityOutput = {}; +// type ICommunityMinOutput = {}; + +// type IMagazineOutput = {}; + // split communities.json and instances.json into smaller files for easier loading // community-index.json @@ -40,21 +175,6 @@ import { open, rm, mkdir } from "node:fs/promises"; // "score": 724 // } -// minified version, only enough for sort/filter -// { -// "base": "lemmy.ml", -// "title": "Lemmy!", -// "name": "lemmy", -// "desc": "lemmy instance is cool and stuff!", -// "sort": { -// "score": 724, //smart sort -// "subscribers": 1, -// "users": "users_active_week", -// "posts": 0, -// "comments": 0, -// } -// } - // instance-index.json // should do all the things needed to transform the redis data into data for frontend @@ -105,48 +225,10 @@ export default class OutputFileWriter { } } - /** - * this method is used to split the data into smaller 
files for easier loading - * - * @param {string} chunkName - the name of the chunk, used for the filename - * @param {number} perFile - how many entries per file - * @param {array} dataArray - the data array to split - */ - async storeChunkedData(chunkName: string, perFile: number, dataArray: any) { - await this.writeJsonFile(`${this.publicDataFolder}/${chunkName}.full.json`, JSON.stringify(dataArray)); - - // mapped versions and the metadata - await mkdir(path.join(this.publicDataFolder, chunkName), { - recursive: true, - }); - - let fileCount = 0; - for (let i = 0; i < dataArray.length; i += perFile) { - let chunk = dataArray.slice(i, i + perFile); - - await this.writeJsonFile( - `${this.publicDataFolder}/${chunkName}/${fileCount}.json`, - JSON.stringify(chunk), - ); - fileCount++; - } - - await this.writeJsonFile( - `${this.publicDataFolder}/${chunkName}.json`, - JSON.stringify({ - count: fileCount, - }), - ); - } - /** * this method is used to store the fediverse data - * - * @param {object} data - the fediverse data - * @param {object} softwareData - the fediverse software data - * @param {object} softwareBaseUrls - the fediverse software base urls */ - async storeFediverseData(data: any, softwareData: any, softwareBaseUrls: any, fediTags: any) { + public async storeFediverseData(data: any, softwareData: any, softwareBaseUrls: any, fediTags: any) { await this.writeJsonFile(`${this.publicDataFolder}/fediverse.json`, JSON.stringify(data)); await this.writeJsonFile( `${this.publicDataFolder}/fediverse_software_counts.json`, @@ -163,11 +245,8 @@ export default class OutputFileWriter { /** * this method is used to store the instance metrics data - * - * @param {string} instanceBaseUrl - the base url of the instance - * @param {object} data - the instance metrics data */ - async storeInstanceMetricsData(instanceBaseUrl: String, data: any) { + public async storeInstanceMetricsData(instanceBaseUrl: String, data: any) { await mkdir(this.metricsPath, { recursive: true, 
}); @@ -177,11 +256,9 @@ export default class OutputFileWriter { /** * this method is used to store the community metrics data - * - * @param {string} instanceBaseUrl - the base url of the instance - * @param {object} data - the instance metrics data */ - async storeCommunityMetricsData(instanceBaseUrl: string, communityData: any) { + public async storeCommunityMetricsData(instanceBaseUrl: string, communityData: any) { + // make sure the directory exists for the instance await mkdir(`${this.communityMetricsPath}/${instanceBaseUrl}`, { recursive: true, }); @@ -192,35 +269,72 @@ export default class OutputFileWriter { ); } - async storeMetaData(data: any) { + public async storeMetaData(data: IMetaDataOutput) { await this.writeJsonFile(`${this.publicDataFolder}/meta.json`, JSON.stringify(data)); } - async storeInstanceErrors(data: any) { + public async storeInstanceErrors(data: any) { await this.writeJsonFile(`${this.publicDataFolder}/instanceErrors.json`, JSON.stringify(data)); } - async storeSuspicousData(data: any) { + public async storeSuspicousData(data: any) { await this.writeJsonFile(`${this.publicDataFolder}/sus.json`, JSON.stringify(data)); } - async storeKbinInstanceList(data: any) { + // stores an array of the string baseUrl + public async storeMBinInstanceData(data: string[]) { await this.writeJsonFile(`${this.publicDataFolder}/kbin.min.json`, JSON.stringify(data)); } - async storeKBinMagazineData(data: any) { + public async storeMBinMagazineData(data: any) { await this.storeChunkedData("magazines", this.magazinesPerFile, data); } - async cleanData() { + /** + * this method is used to clean (delete all files) the data folder + */ + public async cleanData(): Promise { await rm(this.publicDataFolder, { recursive: true, force: true }); await mkdir(this.publicDataFolder, { recursive: true }); } - async writeJsonFile(filename: string, data: any) { - let filehandle: any = null; + /** + * this method is used to split the data into smaller files for easier loading 
+ */ + private async storeChunkedData(chunkName: string, perFile: number, dataArray: any[]): Promise { + await this.writeJsonFile(`${this.publicDataFolder}/${chunkName}.full.json`, JSON.stringify(dataArray)); + + // mapped versions and the metadata + await mkdir(path.join(this.publicDataFolder, chunkName), { + recursive: true, + }); + + let fileCount = 0; + for (let i = 0; i < dataArray.length; i += perFile) { + let chunk = dataArray.slice(i, i + perFile); + + await this.writeJsonFile( + `${this.publicDataFolder}/${chunkName}/${fileCount}.json`, + JSON.stringify(chunk), + ); + fileCount++; + } + + await this.writeJsonFile( + `${this.publicDataFolder}/${chunkName}.json`, + JSON.stringify({ + count: fileCount, + }), + ); + } + + /** + * this method is used to write a JSON file + */ + private async writeJsonFile(fileName: string, data: string): Promise { + let filehandle: FileHandle | null = null; try { - filehandle = await open(filename, "w"); + filehandle = await open(fileName, "w"); await filehandle?.writeFile(data); } finally { diff --git a/crawler/src/output/output.ts b/crawler/src/output/output.ts index edb5cc4..f9dd6fb 100644 --- a/crawler/src/output/output.ts +++ b/crawler/src/output/output.ts @@ -10,9 +10,10 @@ import CrawlClient from "../lib/CrawlClient"; import storage from "../lib/crawlStorage"; import { IInstanceData, IInstanceDataKeyValue } from "../lib/storage/instance"; import { ICommunityData, ICommunityDataKeyValue } from "../lib/storage/community"; -import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/kbin"; +import { IMagazineData, IMagazineDataKeyValue } from "../lib/storage/mbin"; import { IFediverseData, IFediverseDataKeyValue } from "../lib/storage/fediverse"; // import { IFediseerInstanceData } from "../lib/storage/fediseer"; + import { IErrorData, IErrorDataKeyValue, @@ -21,88 +22,17 @@ import { } from "../lib/storage/tracking"; import { IUptimeNodeData, IFullUptimeData } from "../lib/storage/uptime"; -import OutputFileWriter 
from "./file_writer"; +import OutputFileWriter, { + IMetaDataOutput, + IInstanceDataOutput, + ICommunityDataOutput, + IMBinInstanceOutput, + IMBinMagazineOutput, + IFediverseDataOutput, + IClassifiedErrorOutput, +} from "./file_writer"; import OutputTrust from "./trust"; -export type IKBinMagazineOutput = { - actor_id: string; - title: string; - name: string; - preferred: string; - baseurl: string; - summary: string; - sensitive: boolean; - postingRestrictedToMods: boolean; - icon: string; - published: string; - updated: string; - followers: number; - time: number; -}; - -export type IFediverseDataOutput = { - url: string; - software: string; - version: string; -}; - -export type IClassifiedErrorOutput = { - baseurl: string; - time: number; - error: string; - type?: string; -}; - -export type ICommunityDataOutput = { - baseurl: string; - url: string; - name: string; - title: string; - desc: string; - icon: string | null; - banner: string | null; - nsfw: boolean; - counts: Object; - published: number; - time: number; - isSuspicious: boolean; - score: number; -}; - -export type IInstanceDataOutput = { - baseurl: string; - url: string; - name: string; - desc: string; - downvotes: boolean; - nsfw: boolean; - create_admin: boolean; - private: boolean; - fed: boolean; - version: string; - open: boolean; - usage: number; - counts: Object; - icon: string; - banner: string; - langs: string[]; - date: string; - published: number; - time: number; - score: number; - uptime?: IUptimeNodeData; - isSuspicious: boolean; - metrics: Object | null; - tags: string[]; - susReason: string[]; - trust: []; - blocks: { - incoming: number; - outgoing: number; - }; - blocked: string[]; -}; - class OutputUtils { // strip markdown, optionally substring static stripMarkdownSubStr(text: string, maxLength: number = -1) { @@ -225,8 +155,8 @@ class OutputUtils { previousRun, returnInstanceArray: IInstanceDataOutput[], returnCommunityArray: ICommunityDataOutput[], - kbinInstanceArray: string[], - 
kbinMagazineArray: IKBinMagazineOutput[], + mbinInstanceArray: string[], + mbinMagazineArray: IMBinMagazineOutput[], returnStats: IFediverseDataOutput[], ) { const issues: string[] = []; @@ -235,8 +165,8 @@ class OutputUtils { if ( returnInstanceArray.length === 0 || returnCommunityArray.length === 0 || - kbinInstanceArray.length === 0 || - kbinMagazineArray.length === 0 || + mbinInstanceArray.length === 0 || + mbinMagazineArray.length === 0 || returnStats.length === 0 ) { console.log("Empty Array"); @@ -305,16 +235,16 @@ class OutputUtils { old: previousRun.fediverse, }); - // @TODO kbin checks are disabled till scanning is fixed + // @TODO mbin checks are disabled till scanning is fixed // data.push({ // type: "magazines", - // new: kbinMagazineArray.length, + // new: mbinMagazineArray.length, // old: previousRun.magazines, // }); // data.push({ - // type: "kbin_instances", - // new: kbinInstanceArray.length, - // old: previousRun.kbin_instances, + // type: "mbin_instances", + // new: mbinInstanceArray.length, + // old: previousRun.mbin_instances, // }); for (let i = 0; i < data.length; i++) { @@ -354,7 +284,7 @@ export default class CrawlOutput { private instanceList: IInstanceData[] | null; private communityList: ICommunityData[] | null; private fediverseData: IFediverseDataKeyValue | null; - private kbinData: IMagazineData[] | null; + private mbinData: IMagazineData[] | null; private fileWriter: OutputFileWriter; private trust: OutputTrust; @@ -366,7 +296,7 @@ export default class CrawlOutput { this.instanceList = null; this.communityList = null; this.fediverseData = null; - this.kbinData = null; + this.mbinData = null; // this.utils = new OutputUtils(); @@ -382,7 +312,7 @@ export default class CrawlOutput { this.instanceList = await storage.instance.getAll(); this.communityList = await storage.community.getAll(); this.fediverseData = await storage.fediverse.getAll(); - this.kbinData = await storage.kbin.getAll(); + this.mbinData = await storage.mbin.getAll(); 
} /** @@ -403,8 +333,8 @@ export default class CrawlOutput { throw new Error("No fediverse Data"); } - if (!this.kbinData) { - throw new Error("No kbin Data"); + if (!this.mbinData) { + throw new Error("No mbin Data"); } // setup trust data @@ -447,9 +377,9 @@ export default class CrawlOutput { // fediverse data const returnStats = await this.outputFediverseData(returnInstanceArray); - // kbin data - const kbinInstanceArray = await this.outputKBinInstanceList(returnStats); - const kbinMagazineArray = await this.outputKBinMagazineList(); + // mbin data + const mbinInstanceArray = await this.outputMBinInstanceList(returnStats); + const mbinMagazineArray = await this.outputMBinMagazineList(); // error data const instanceErrors = await this.outputClassifiedErrors(); @@ -459,12 +389,11 @@ export default class CrawlOutput { (await readFile(new URL("../../package.json", import.meta.url))).toString(), ); - const metaData = { + const metaData: IMetaDataOutput = { instances: returnInstanceArray.length, communities: returnCommunityArray.length, - kbin_instances: kbinInstanceArray.length, - magazines: kbinMagazineArray.length, - // kbin_instances: kbinInstanceArray.length, + mbin_instances: mbinInstanceArray.length, + magazines: mbinMagazineArray.length, fediverse: returnStats.length, time: Date.now(), package: packageJson.name, @@ -508,20 +437,22 @@ export default class CrawlOutput { Previous: previousRun.communities, Change: calcChangeDisplay(returnCommunityArray.length, previousRun.communities), }, - KBinInstances: { - ExportName: "KBin Instances", + + MBinInstances: { + ExportName: "MBin Instances", Total: "N/A", - Output: kbinInstanceArray.length, - Previous: previousRun.kbin_instances, - Change: calcChangeDisplay(kbinInstanceArray.length, previousRun.kbin_instances), + Output: mbinInstanceArray.length, + Previous: previousRun.mbin_instances, + Change: calcChangeDisplay(mbinInstanceArray.length, previousRun.mbin_instances), }, Magazines: { ExportName: "Magazines", - 
Total: this.kbinData.length, - Output: kbinMagazineArray.length, + Total: this.mbinData.length, + Output: mbinMagazineArray.length, Previous: previousRun.magazines, - Change: calcChangeDisplay(kbinMagazineArray.length, previousRun.magazines), + Change: calcChangeDisplay(mbinMagazineArray.length, previousRun.magazines), }, + Fediverse: { ExportName: "Fediverse Servers", Total: "N/A", @@ -547,8 +478,8 @@ export default class CrawlOutput { previousRun, returnInstanceArray, returnCommunityArray, - kbinInstanceArray, - kbinMagazineArray, + mbinInstanceArray, + mbinMagazineArray, returnStats, ); @@ -1023,14 +954,14 @@ export default class CrawlOutput { return returnStats; } - // KBIN + // mbin - private async outputKBinInstanceList(returnStats: IFediverseDataOutput[]): Promise { - let kbinInstanceUrls: string[] = returnStats + private async outputMBinInstanceList(returnStats: IFediverseDataOutput[]): Promise { + let mbinInstanceUrls: string[] = returnStats .map((fediverse) => { // const fediverse = this.fediverseData[fediKey]; - if (fediverse.software && fediverse.software === "kbin") { + if (fediverse.software && fediverse.software === "mbin") { return fediverse.url; } @@ -1038,50 +969,51 @@ export default class CrawlOutput { }) .filter((instance) => instance !== null); - await this.fileWriter.storeKbinInstanceList(kbinInstanceUrls); + await this.fileWriter.storeMBinInstanceData(mbinInstanceUrls); - return kbinInstanceUrls; + return mbinInstanceUrls; } // generate a list of all the instances that are suspicious and the reasons - private async outputKBinMagazineList(): Promise { - const output: IKBinMagazineOutput[] = []; + private async outputMBinMagazineList(): Promise { + const output: IMBinMagazineOutput[] = []; - if (!this.kbinData) { - throw new Error("No KBin data"); + if (!this.mbinData) { + throw new Error("No MBin data"); } // filter old data - const filteredKBins = this.kbinData.filter((kbin) => { - return kbin.lastCrawled > Date.now() - 
OUTPUT_MAX_AGE.MAGAZINE; + const filteredMBins = this.mbinData.filter((mbin) => { + if (!mbin.lastCrawled) return false; // record needs time + return mbin.lastCrawled > Date.now() - OUTPUT_MAX_AGE.MAGAZINE; }); - logging.info("KBin Magazines filteredKBins", this.kbinData.length, filteredKBins.length); + logging.info("MBin Magazines filteredMBins", this.mbinData.length, filteredMBins.length); - for (const kbin of filteredKBins) { + for (const mbin of filteredMBins) { output.push({ - actor_id: kbin.id, - - title: kbin.title, // display name - name: kbin.name, // key username - preferred: kbin.preferredUsername, // username ?? + baseUrl: mbin.baseurl, + magazineId: mbin.magazineId, - baseurl: kbin.id.split("/")[2], + title: mbin.title, // display name + name: mbin.name, // key username + // preferred: mbin.preferredUsername, // username ?? - summary: OutputUtils.stripMarkdownSubStr(kbin.summary, 350), - sensitive: kbin.sensitive, - postingRestrictedToMods: kbin.postingRestrictedToMods, + description: OutputUtils.stripMarkdownSubStr(mbin.description, 350), + isAdult: mbin.isAdult, + postingRestrictedToMods: mbin.isPostingRestrictedToMods, - icon: kbin.icon ? kbin.icon.url : null, - published: kbin.published, - updated: kbin.updated, - followers: kbin.followerCount, + icon: mbin.icon ? 
mbin.icon.url : null, + // published: mbin.published, + // updated: mbin.updated, + subscriptions: mbin.subscriptionsCount, + posts: mbin.postCount, - time: kbin.lastCrawled || 0, + time: mbin.lastCrawled || 0, }); } - await this.fileWriter.storeKBinMagazineData(output); + await this.fileWriter.storeMBinMagazineData(output); return output; } diff --git a/crawler/src/util/aged.ts b/crawler/src/util/aged.ts index f5a04d7..e2d6526 100644 --- a/crawler/src/util/aged.ts +++ b/crawler/src/util/aged.ts @@ -3,7 +3,7 @@ import logging from "../lib/logging"; import InstanceQueue from "../queue/instance"; import CommunityQueue from "../queue/community_list"; import SingleCommunityQueue from "../queue/community_single"; -// import KBinQueue from "../queue/kbin"; +import MBinQueue from "../queue/mbin"; import storage from "../lib/crawlStorage"; @@ -15,7 +15,7 @@ export default class CrawlAged { private instanceCrawler: InstanceQueue; private communityCrawler: CommunityQueue; private singleCommunityCrawler: SingleCommunityQueue; - // private kbinCrawler: KBinQueue; + private mbinCrawler: MBinQueue; constructor() { this.agedInstanceBaseUrls = []; @@ -24,8 +24,8 @@ export default class CrawlAged { this.communityCrawler = new CommunityQueue(false); this.singleCommunityCrawler = new SingleCommunityQueue(false); - // @TODO scan for aged kbin magazines - // this.kbinCrawler = new KBinQueue(false); + // scan for aged magazines + this.mbinCrawler = new MBinQueue(false); } async recordAges() { @@ -33,7 +33,7 @@ export default class CrawlAged { const instances = await storage.instance.getAll(); const communities = await storage.community.getAll(); - const magazines = await storage.kbin.getAll(); + const magazines = await storage.mbin.getAll(); const fediverse = await storage.fediverse.getAll(); const errors = await storage.tracking.getAllErrors("*"); const lastCrawls = await storage.tracking.listAllLastCrawl(); @@ -219,7 +219,7 @@ export default class CrawlAged { return false; }); - 
logging.info("Aged Communities By Base", Object.keys(byBase).length); + logging.info("Aged Communities By Base (showing over 100 communities)", Object.keys(byBase).length); const baseCounts = Object.keys(byBase) .map((base) => { @@ -228,7 +228,8 @@ export default class CrawlAged { count: byBase[base].length, }; }) - .sort((a, b) => a.count - b.count); + .sort((a, b) => a.count - b.count) + .filter((a) => a.count > 100); console.table(baseCounts); @@ -252,6 +253,32 @@ export default class CrawlAged { await this.communityCrawler.createJob(baseUrl); } + // get aged magazines + const magazines = await storage.mbin.getAll(); + + const agedMagazines = Object.values(magazines).filter((magazine) => { + if (!magazine.lastCrawled) return true; // not set + + if (Date.now() - magazine.lastCrawled > CRAWL_AGED_TIME.MAGAZINE) { + return true; + } + + return false; + }); + + // get base url for each magazine + const agedMagazineBaseUrls = agedMagazines.map((magazine) => magazine.baseurl); + + // filter those dupes + const uniqueAgedMagazineBaseUrls = [...new Set(agedMagazineBaseUrls)]; + + logging.info( + `Magazines Total: ${magazines.length} Aged ${agedMagazineBaseUrls.length}, Total Instances: ${uniqueAgedMagazineBaseUrls.length}`, + ); + for (const baseUrl of uniqueAgedMagazineBaseUrls) { + await this.mbinCrawler.createJob(baseUrl); + } + logging.info("Done Creating Aged Jobs"); } }