diff --git a/crawler/package.json b/crawler/package.json index 93d2172..d0c352a 100644 --- a/crawler/package.json +++ b/crawler/package.json @@ -3,13 +3,14 @@ "start": "cd src && node index.js" }, "dependencies": { - "cheerio": "^1.0.0-rc.10", + "cheerio": "^1.0.0-rc.12", "esm": "^3.2.25", + "node-fetch": "2.6.7", "node-fetch-cookies": "^2.0.3", - "puppeteer": "^10.2.0", - "puppeteer-autoscroll-down": "^0.1.7", - "puppeteer-extra": "^3.1.18", - "puppeteer-extra-plugin-block-resources": "^2.2.9", - "puppeteer-extra-plugin-stealth": "^2.7.8" + "puppeteer": "^19.4.1", + "puppeteer-autoscroll-down": "^1.1.1", + "puppeteer-extra": "^3.3.4", + "puppeteer-extra-plugin-block-resources": "^2.4.2", + "puppeteer-extra-plugin-stealth": "^2.11.1" } } diff --git a/crawler/src/crawlers/azuki.js b/crawler/src/crawlers/azuki.js index 8f3272a..f47aed3 100644 --- a/crawler/src/crawlers/azuki.js +++ b/crawler/src/crawlers/azuki.js @@ -2,13 +2,16 @@ const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); const sleep = require('../util/sleep'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); module.exports = async () => { // azuki has no api, unfortunately - const browser = await puppeteer.launch(); + const browser = await puppeteer.launch({ + executablePath: executablePath() + }); // todo: automate this let currentPages = 4; diff --git a/crawler/src/crawlers/bilibili.js b/crawler/src/crawlers/bilibili.js index bb734c0..add1017 100644 --- a/crawler/src/crawlers/bilibili.js +++ b/crawler/src/crawlers/bilibili.js @@ -1,8 +1,9 @@ const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); -const scrollPageToBottom = require('puppeteer-autoscroll-down'); +const { scrollPageToBottom } = require('puppeteer-autoscroll-down'); const sleep = require('../util/sleep'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); @@ -10,7 +11,9 @@ puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'fo module.exports = async () => { // until i can figure out how their api works, we will have to do with using puppeteer // slower, but works - const browser = await puppeteer.launch(); + const browser = await puppeteer.launch({ + executablePath: executablePath() + }); const page = await browser.newPage(); await page.goto('https://www.bilibilicomics.com/genre'); diff --git a/crawler/src/crawlers/bookwalker.js b/crawler/src/crawlers/bookwalker.js index 624e4f9..0ca79db 100644 --- a/crawler/src/crawlers/bookwalker.js +++ b/crawler/src/crawlers/bookwalker.js @@ -2,6 +2,7 @@ const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); const sleep = require('../util/sleep'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); @@ -9,7 +10,9 @@ puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'fo module.exports = async (novel) => { // bookwalker also has no api, unfortunately // it also has 40 pages! do you have any idea how long it takes to load all this? - const browser = await puppeteer.launch(); + const browser = await puppeteer.launch({ + executablePath: executablePath() + }); // todo: automate this let currentPages = 40; diff --git a/crawler/src/crawlers/crunchyroll.js b/crawler/src/crawlers/crunchyroll.js index cbde818..697c238 100644 --- a/crawler/src/crawlers/crunchyroll.js +++ b/crawler/src/crawlers/crunchyroll.js @@ -2,13 +2,16 @@ const puppeteer = require('puppeteer-extra'); const cheerio = require('cheerio'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); module.exports = async () => { // crunchyroll uses cloudflare so we try to not be a robot - const browser = await puppeteer.launch(); + const browser = await puppeteer.launch({ + executablePath: executablePath() + }); const page = await browser.newPage(); await page.goto('https://www.crunchyroll.com/comics/manga/alpha?group=all'); // todo: stop using cheerio diff --git a/crawler/src/crawlers/lezhinus.js b/crawler/src/crawlers/lezhinus.js index 6903a6c..eec84ed 100644 --- a/crawler/src/crawlers/lezhinus.js +++ b/crawler/src/crawlers/lezhinus.js @@ -2,13 +2,15 @@ const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); const sleep = require('../util/sleep'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); module.exports = async () => { const browser = await puppeteer.launch({ - headless: false + headless: false, + executablePath: executablePath() }); // todo: automate this diff --git a/crawler/src/crawlers/mangaplanet.js b/crawler/src/crawlers/mangaplanet.js index be3322e..336543d 100644 --- a/crawler/src/crawlers/mangaplanet.js +++ b/crawler/src/crawlers/mangaplanet.js @@ -2,13 +2,16 @@ const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); const sleep = require('../util/sleep'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); module.exports = async () => { // azuki has no api, unfortunately - const browser = await puppeteer.launch(); + const browser = await puppeteer.launch({ + executablePath: executablePath() + }); // todo: automate this let currentPages = 4; diff --git a/crawler/src/crawlers/mangaplus.js b/crawler/src/crawlers/mangaplus.js index d103adb..13fcd0c 100644 --- a/crawler/src/crawlers/mangaplus.js +++ b/crawler/src/crawlers/mangaplus.js @@ -1,7 +1,8 @@ const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); -const scrollPageToBottom = require('puppeteer-autoscroll-down'); +const { scrollPageToBottom } = require('puppeteer-autoscroll-down'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); @@ -9,7 +10,9 @@ puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'fo module.exports = async () => { // until i can figure out how their api works, we will have to do with using puppeteer // slower, but works - const browser = await puppeteer.launch(); + const browser = await puppeteer.launch({ + executablePath: executablePath() + }); const page = await browser.newPage(); await page.goto('https://mangaplus.shueisha.co.jp/manga_list/all'); diff --git a/crawler/src/crawlers/myanimelist.js b/crawler/src/crawlers/myanimelist.js index 61ffed0..36e0b6d 100644 --- a/crawler/src/crawlers/myanimelist.js +++ b/crawler/src/crawlers/myanimelist.js @@ -2,6 +2,7 @@ const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); const sleep = require('../util/sleep'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); @@ -9,7 +10,8 @@ puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'fo module.exports = async () => { // MAL api is garbage/non-existant const browser = await puppeteer.launch({ - headless: false + headless: false, + executablePath: executablePath() }); // todo: automate this diff --git a/crawler/src/crawlers/renta.js b/crawler/src/crawlers/renta.js index e5b12b9..4129b8a 100644 --- a/crawler/src/crawlers/renta.js +++ b/crawler/src/crawlers/renta.js @@ -2,13 +2,16 @@ const puppeteer = require('puppeteer-extra'); const StealthPlugin = require('puppeteer-extra-plugin-stealth'); const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources'); const sleep = require('../util/sleep'); +const { executablePath } = require('puppeteer'); puppeteer.use(StealthPlugin()); puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font']))); module.exports = async () => { // renta has no api, unfortunately - const browser = await puppeteer.launch(); + const browser = await puppeteer.launch({ + executablePath: executablePath() + }); // todo: automate this let categories = [{ diff --git a/crawler/src/crawlinterval.txt b/crawler/src/crawlinterval.txt index a6802d4..65c56bf 100644 --- a/crawler/src/crawlinterval.txt +++ b/crawler/src/crawlinterval.txt @@ -1 +1 @@ -1628607072198 \ No newline at end of file +1673093712653 \ No newline at end of file diff --git a/crawler/src/index.js b/crawler/src/index.js index 4050a68..463ecc5 100644 --- a/crawler/src/index.js +++ b/crawler/src/index.js @@ -10,7 +10,7 @@ if (!fs.existsSync('./data.json')) { let data = require('./data.json'); if (data.length === 0) { console.info('Crawl started..'); - crawl(log).then(() => { + crawl().then(() => { // refresh data delete require.cache[require.resolve('./data.json')]; data = require('./data.json'); diff --git a/crawler/src/util/crawl.js b/crawler/src/util/crawl.js index 229439f..dd13b9f 100644 --- a/crawler/src/util/crawl.js +++ b/crawler/src/util/crawl.js @@ -8,29 +8,29 @@ const mangaplus = require('../crawlers/mangaplus.js'); const azuki = require('../crawlers/azuki.js'); const bookwalker = require('../crawlers/bookwalker.js'); -module.exports = async (log) => { +module.exports = async () => { let newArray = []; // this is my personal edge user agent const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62'; // node-fetch newArray = newArray.concat(await viz(ua)); - log.info('Crawl Viz done'); + console.info('Crawl Viz done'); newArray = newArray.concat(await netcomics(ua)); - log.info('Crawl Netcomics done'); + console.info('Crawl Netcomics done'); newArray = newArray.concat(await inky(ua)); - log.info('Crawl Inky done'); + console.info('Crawl Inky done'); // puppeteer newArray = newArray.concat(await crunchy()); - log.info('Crawl Crunchyroll done'); + console.info('Crawl Crunchyroll done'); newArray = newArray.concat(await mangaplus()); - log.info('Crawl MangaPlus done'); + console.info('Crawl MangaPlus done'); newArray = newArray.concat(await azuki()); - log.info('Crawl Azuki done'); + console.info('Crawl Azuki done'); newArray = newArray.concat(await bookwalker()); - log.info('Crawl Bookwalker done'); + console.info('Crawl Bookwalker done'); fs.writeFileSync('./data.json', JSON.stringify(newArray)); - log.info('Crawl write file done'); + console.info('Crawl write file done'); Promise.resolve('Success'); };