From 616b43b70c009b70536311a0345d32ee90bef161 Mon Sep 17 00:00:00 2001
From: David Ralph
Date: Sat, 7 Jan 2023 11:59:51 +0000
Subject: [PATCH] make crawler open source and convert to monorepo

---
 .gitignore | 4 ++
 README.md | 4 +-
 backend/api/404.js | 15 ++++++
 backend/api/index.js | 14 +++++
 backend/api/mal.js | 26 ++++++++++
 backend/api/random.js | 33 ++++++++++++
 backend/api/search.js | 39 ++++++++++++++
 backend/config.json | 7 +++
 backend/package.json | 7 +++
 backend/struct/ratelimiter.js | 6 +++
 backend/vercel.json | 24 +++++++++
 crawler/package.json | 15 ++++++
 crawler/src/config.json | 11 ++++
 crawler/src/crawlers/azuki.js | 42 +++++++++++++++
 crawler/src/crawlers/bilibili.js | 40 +++++++++++++++
 crawler/src/crawlers/bookwalker.js | 48 +++++++++++++++++
 crawler/src/crawlers/comicwalker.js | 35 +++++++++++++
 crawler/src/crawlers/crunchyroll.js | 30 +++++++++++
 crawler/src/crawlers/inkr.js | 16 ++++++
 crawler/src/crawlers/inky.js | 25 +++++++++
 crawler/src/crawlers/j-novel.js | 19 +++++++
 crawler/src/crawlers/lezhinus.js | 48 +++++++++++++++++
 crawler/src/crawlers/mangaplanet.js | 42 +++++++++++++++
 crawler/src/crawlers/mangaplus.js | 41 +++++++++++++++
 crawler/src/crawlers/myanimelist.js | 48 +++++++++++++++++
 crawler/src/crawlers/netcomics.js | 14 +++++
 crawler/src/crawlers/renta.js | 51 +++++++++++++++++++
 crawler/src/crawlers/toomics.js | 16 ++++++
 crawler/src/crawlers/viz.js | 23 +++++++++
 crawler/src/crawlinterval.txt | 1 +
 crawler/src/index.js | 31 +++++++++++
 crawler/src/test.js | 7 +++
 crawler/src/util/crawl.js | 36 +++++++++++++
 crawler/src/util/crawlLightNovels.js | 22 ++++++++
 crawler/src/util/crawlTest.js | 17 +++++++
 crawler/src/util/interval.js | 30 +++++++++++
 crawler/src/util/localStorage.js | 14 +++++
 crawler/src/util/sleep.js | 4 ++
 package.json => frontend/package.json | 0
 {public => frontend/public}/index.html | 0
 {public => frontend/public}/manifest.json | 0
 {src => frontend/src}/App.jsx | 0
 {src => frontend/src}/components/Footer.jsx | 0
 {src => frontend/src}/components/Navbar.jsx | 0
 {src => frontend/src}/components/Random.jsx | 0
 {src => frontend/src}/components/Results.jsx | 0
 {src => frontend/src}/index.js | 0
 {src => frontend/src}/modules/constants.js | 0
 {src => frontend/src}/modules/placeholders.js | 0
 {src => frontend/src}/scss/index.scss | 0
 .../src}/scss/modules/_cards.scss | 0
 {src => frontend/src}/scss/modules/_dark.scss | 0
 .../src}/scss/modules/_dropdown.scss | 0
 .../src}/scss/modules/_navbar.scss | 0
 {src => frontend/src}/service-worker.js | 0
 .../src}/serviceWorkerRegistration.js | 0
 56 files changed, 902 insertions(+), 3 deletions(-)
 create mode 100644 backend/api/404.js
 create mode 100644 backend/api/index.js
 create mode 100644 backend/api/mal.js
 create mode 100644 backend/api/random.js
 create mode 100644 backend/api/search.js
 create mode 100644 backend/config.json
 create mode 100644 backend/package.json
 create mode 100644 backend/struct/ratelimiter.js
 create mode 100644 backend/vercel.json
 create mode 100644 crawler/package.json
 create mode 100644 crawler/src/config.json
 create mode 100644 crawler/src/crawlers/azuki.js
 create mode 100644 crawler/src/crawlers/bilibili.js
 create mode 100644 crawler/src/crawlers/bookwalker.js
 create mode 100644 crawler/src/crawlers/comicwalker.js
 create mode 100644 crawler/src/crawlers/crunchyroll.js
 create mode 100644 crawler/src/crawlers/inkr.js
 create mode 100644 crawler/src/crawlers/inky.js
 create mode 100644 crawler/src/crawlers/j-novel.js
 create mode 100644 crawler/src/crawlers/lezhinus.js
 create mode 100644 crawler/src/crawlers/mangaplanet.js
 create mode 100644 crawler/src/crawlers/mangaplus.js
 create mode 100644 crawler/src/crawlers/myanimelist.js
 create mode 100644 crawler/src/crawlers/netcomics.js
 create mode 100644 crawler/src/crawlers/renta.js
 create mode 100644 crawler/src/crawlers/toomics.js
 create mode 100644 crawler/src/crawlers/viz.js
 create mode 100644 crawler/src/crawlinterval.txt
 create mode 100644 crawler/src/index.js
 create mode 100644 crawler/src/test.js
 create mode 100644 crawler/src/util/crawl.js
 create mode 100644 crawler/src/util/crawlLightNovels.js
 create mode 100644 crawler/src/util/crawlTest.js
 create mode 100644 crawler/src/util/interval.js
 create mode 100644 crawler/src/util/localStorage.js
 create mode 100644 crawler/src/util/sleep.js
 rename package.json => frontend/package.json (100%)
 rename {public => frontend/public}/index.html (100%)
 rename {public => frontend/public}/manifest.json (100%)
 rename {src => frontend/src}/App.jsx (100%)
 rename {src => frontend/src}/components/Footer.jsx (100%)
 rename {src => frontend/src}/components/Navbar.jsx (100%)
 rename {src => frontend/src}/components/Random.jsx (100%)
 rename {src => frontend/src}/components/Results.jsx (100%)
 rename {src => frontend/src}/index.js (100%)
 rename {src => frontend/src}/modules/constants.js (100%)
 rename {src => frontend/src}/modules/placeholders.js (100%)
 rename {src => frontend/src}/scss/index.scss (100%)
 rename {src => frontend/src}/scss/modules/_cards.scss (100%)
 rename {src => frontend/src}/scss/modules/_dark.scss (100%)
 rename {src => frontend/src}/scss/modules/_dropdown.scss (100%)
 rename {src => frontend/src}/scss/modules/_navbar.scss (100%)
 rename {src => frontend/src}/service-worker.js (100%)
 rename {src => frontend/src}/serviceWorkerRegistration.js (100%)

diff --git a/.gitignore b/.gitignore
index d935e6b..ca1238a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,13 @@
 # Folders
 build/
+.vercel/
 .vscode/
 .history/
 node_modules/
 
 # Files
 yarn.lock
+data.json
+dataTest.json
+pnpm-lock.yaml
 package-lock.json
diff --git a/README.md b/README.md
index b37cb48..40171f0 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,7 @@
 As of the latest update, support for manhwa and manhua has been added. You can choose between manga, light novels, manhwa or manhua from the dropdown on the main page.
 
 ## About
-I built Manga Pages to solve the issue of finding which legal service provides a manga/light novel. Search engines don't always show this information (and often show lots of ad-filled unofficial sites). The site provides a simple hub to finding what you want to read next showing which sites have the manga/light novel available and a MyAnimeList button to read information about it. Think of this service as like [because.moe](https://because.moe) but for manga/light novels. The source code for the backend can be found
-[here](https://github.com/davidcralph/mangapages-backend).
-
+I built Manga Pages to solve the problem of finding which legal service carries a given manga or light novel. Search engines don't always surface this information (and often return ad-filled unofficial sites instead). The site provides a simple hub for finding what to read next, showing which sites have the manga or light novel available, plus a MyAnimeList button for more information about it. Think of this service as [because.moe](https://because.moe), but for manga and light novels.
 
 ## Features
 🔍 Fast search with support for [multiple websites](#supported-websites)
diff --git a/backend/api/404.js b/backend/api/404.js
new file mode 100644
index 0000000..941ab26
--- /dev/null
+++ b/backend/api/404.js
@@ -0,0 +1,15 @@
+const rateLimit = require('../struct/ratelimiter');
+
+module.exports = async (req, res) => {
+  try {
+    await rateLimit(30, req.headers['x-real-ip']);
+  } catch (error) {
+    return res.status(429).send({
+      message: 'Too many requests'
+    });
+  }
+
+  return res.status(404).send({
+    message: 'Not found'
+  });
+};
diff --git a/backend/api/index.js b/backend/api/index.js
new file mode 100644
index 0000000..71a0b10
--- /dev/null
+++ b/backend/api/index.js
@@ -0,0 +1,14 @@
+const config = require('../config.json');
+const rateLimit = require('../struct/ratelimiter');
+
+module.exports = async (req, res) => {
+  try {
+    await rateLimit(30, req.headers['x-real-ip']);
+  } catch (error) {
+    return res.status(429).send({
+      message: 'Too many requests'
+    });
+  }
+
+  return res.status(200).send(config.helloworld);
+};
diff --git a/backend/api/mal.js b/backend/api/mal.js
new file mode 100644
index 0000000..db0509b
--- /dev/null
+++ b/backend/api/mal.js
@@ -0,0 +1,26 @@
+const fetch = require('node-fetch');
+const rateLimit = require('../struct/ratelimiter');
+
+module.exports = async (req, res) => {
+  try {
+    await rateLimit(50, req.headers['x-real-ip']);
+  } catch (error) {
+    return res.status(429).send({
+      message: 'Too many requests',
+    });
+  }
+
+  if (!req.query.slug) {
+    return res.status(400).send({
+      message: 'Input param required',
+    });
+  }
+
+  const data = await (
+    await fetch(
+      `https://myanimelist.net/search/prefix.json?type=manga&keyword=${req.query.slug}`
+    )
+  ).json();
+
+  return res.redirect(302, data.categories[0].items[0].url);
+};
diff --git a/backend/api/random.js b/backend/api/random.js
new file mode 100644
index 0000000..a7c8483
--- /dev/null
+++ b/backend/api/random.js
@@ -0,0 +1,33 @@
+const data = require('../data.json');
+const dataLightNovel = require('../dataLightNovel.json');
+const dataManhwa = require('../dataManhwa.json');
+const dataManhua = require('../dataManhua.json');
+const rateLimit = require('../struct/ratelimiter');
+
+module.exports = async (req, res) => {
+  try {
+    await rateLimit(100, req.headers['x-real-ip']);
+  } catch (error) {
+    return res.status(429).send({
+      message: 'Too many requests',
+    });
+  }
+
+  let use = data;
+  switch (req.query.type) {
+    case 'lightnovel':
+      use = dataLightNovel;
+      break;
+    case 'manhwa':
+      use = dataManhwa;
+      break;
+    case 'manhua':
+      use = dataManhua;
+      break;
+    default:
+      break;
+  }
+
+  res.setHeader('Access-Control-Allow-Origin', '*');
+  return res.status(200).send(use.sort(() => Math.random() - 0.5).slice(-4));
+};
diff --git a/backend/api/search.js b/backend/api/search.js
new file mode 100644
index 0000000..328d41f
--- /dev/null
+++ b/backend/api/search.js
@@ -0,0 +1,45 @@
+const data = require('../data.json');
+const dataLightNovel = require('../dataLightNovel.json');
+const dataManhwa = require('../dataManhwa.json');
+const dataManhua = require('../dataManhua.json');
+const { matchSorter } = require('match-sorter');
+const rateLimit = require('../struct/ratelimiter');
+
+module.exports = async (req, res) => {
+  try {
+    await rateLimit(500, req.headers['x-real-ip']);
+  } catch (error) {
+    return res.status(429).send({
+      message: 'Too many requests'
+    });
+  }
+
+  res.setHeader('Access-Control-Allow-Origin', '*');
+
+  if (!req.query.input) {
+    return res.status(400).send({
+      message: 'Input query required'
+    });
+  }
+
+  let use = data;
+  switch (req.query.type) {
+    case 'lightnovel':
+      use = dataLightNovel;
+      break;
+    case 'manhwa':
+      use = dataManhwa;
+      break;
+    case 'manhua':
+      use = dataManhua;
+      break;
+    default:
+      break;
+  }
+
+  const mangaResults = matchSorter(use, req.query.input, {
+    keys: ['title', 'site'],
+    threshold: matchSorter.rankings.WORD_STARTS_WITH
+  });
+
+  return res.status(200).send(mangaResults.slice(0, 300));
+};
diff --git a/backend/config.json b/backend/config.json
new file mode 100644
index 0000000..9d8082b
--- /dev/null
+++ b/backend/config.json
@@ -0,0 +1,7 @@
+{
+  "helloworld": {
+    "version": "1.0.0",
+    "message": "Hello World!"
+  },
+  "ratelimit_time": 60
+}
diff --git a/backend/package.json b/backend/package.json
new file mode 100644
index 0000000..79c476c
--- /dev/null
+++ b/backend/package.json
@@ -0,0 +1,7 @@
+{
+  "dependencies": {
+    "lambda-rate-limiter": "^3.0.1",
+    "match-sorter": "^6.3.1",
+    "node-fetch": "^2.6.7"
+  }
+}
diff --git a/backend/struct/ratelimiter.js b/backend/struct/ratelimiter.js
new file mode 100644
index 0000000..b197fee
--- /dev/null
+++ b/backend/struct/ratelimiter.js
@@ -0,0 +1,6 @@
+const config = require('../config.json');
+const ratelimit = require('lambda-rate-limiter');
+
+module.exports = ratelimit({
+  interval: config.ratelimit_time * 1000
+}).check;
diff --git a/backend/vercel.json b/backend/vercel.json
new file mode 100644
index 0000000..11efad9
--- /dev/null
+++ b/backend/vercel.json
@@ -0,0 +1,24 @@
+{
+  "rewrites": [
+    {
+      "source": "/",
+      "destination": "/api/index"
+    },
+    {
+      "source": "/random",
+      "destination": "/api/random"
+    },
+    {
+      "source": "/search",
+      "destination": "/api/search"
+    },
+    {
+      "source": "/mal/:slug",
+      "destination": "/api/mal?slug=:slug"
+    },
+    {
+      "source": "/(.*)",
+      "destination": "/api/404"
+    }
+  ]
+}
diff --git a/crawler/package.json b/crawler/package.json
new file mode 100644
index 0000000..93d2172
--- /dev/null
+++ b/crawler/package.json
@@ -0,0 +1,16 @@
+{
+  "scripts": {
+    "start": "cd src && node index.js"
+  },
+  "dependencies": {
+    "cheerio": "^1.0.0-rc.10",
+    "esm": "^3.2.25",
+    "node-fetch": "^2.6.7",
+    "node-fetch-cookies": "^2.0.3",
+    "puppeteer": "^10.2.0",
+    "puppeteer-autoscroll-down": "^0.1.7",
+    "puppeteer-extra": "^3.1.18",
+    "puppeteer-extra-plugin-block-resources": "^2.2.9",
+    "puppeteer-extra-plugin-stealth": "^2.7.8"
+  }
+}
diff --git a/crawler/src/config.json b/crawler/src/config.json
new file mode 100644
index 0000000..b9519b1
--- /dev/null
+++ b/crawler/src/config.json
@@ -0,0 +1,11 @@
+{
+  "port": 80,
+  "ratelimit": {
+    "max": 500,
+    "per": "1 minute"
+  },
+  "log": {
+    "level": "info",
+    "prettyPrint": true
+  }
+}
\ No newline at end of file
diff --git a/crawler/src/crawlers/azuki.js b/crawler/src/crawlers/azuki.js
new file mode 100644
index 0000000..8f3272a
--- /dev/null
+++ b/crawler/src/crawlers/azuki.js
@@ -0,0 +1,42 @@
+const puppeteer = require('puppeteer-extra');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+const sleep = require('../util/sleep');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async () => {
+  // azuki has no api, unfortunately
+  const browser = await puppeteer.launch();
+
+  // todo: automate this
+  let currentPages = 4;
+  let newArray = [];
+
+  const page = await browser.newPage();
+
+  for (let i = 0; i < currentPages; i++) {
+    await sleep();
+    await page.goto('https://www.azuki.co/series/' + (i + 1));
+
+    const list = await page.evaluate(() => {
+      const titles = document.querySelectorAll('.a-card-link');
+      let array = [];
+
+      titles.forEach(title => {
+        array.push({
+          title: title.innerText,
+          site: 'azuki',
+          url: 'https://www.azuki.co/series/' + title.innerText.toLowerCase().replace(/[^a-z0-9 ]/g, '').replaceAll(' ', '-')
+        });
+      });
+      return array;
+    });
+    newArray = newArray.concat(list);
+  }
+
+  await browser.close();
+
+  return newArray;
+};
diff --git a/crawler/src/crawlers/bilibili.js b/crawler/src/crawlers/bilibili.js
new file mode 100644
index 0000000..bb734c0
--- /dev/null
+++ b/crawler/src/crawlers/bilibili.js
@@ -0,0 +1,40 @@
+const puppeteer = require('puppeteer-extra');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+const scrollPageToBottom = require('puppeteer-autoscroll-down');
+const sleep = require('../util/sleep');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async () => {
+  // until i can figure out how their api works, we will have to make do with puppeteer
+  // slower, but works
+  const browser = await puppeteer.launch();
+  const page = await browser.newPage();
+  await page.goto('https://www.bilibilicomics.com/genre');
+
+  await scrollPageToBottom(page);
+  await sleep();
+  await scrollPageToBottom(page);
+
+  const list = await page.evaluate(() => {
+    let newArray = [];
+    // this will probably need changing often
+    const divs = document.querySelectorAll('.text-info-section');
+
+    divs.forEach(div => {
+      newArray.push({
+        title: div.querySelector('.manga-title').innerText,
+        site: 'bilibili',
+        url: div.querySelector('a').href,
+      });
+    });
+
+    return newArray;
+  });
+
+  await browser.close();
+
+  return list;
+};
diff --git a/crawler/src/crawlers/bookwalker.js b/crawler/src/crawlers/bookwalker.js
new file mode 100644
index 0000000..624e4f9
--- /dev/null
+++ b/crawler/src/crawlers/bookwalker.js
@@ -0,0 +1,48 @@
+const puppeteer = require('puppeteer-extra');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+const sleep = require('../util/sleep');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async (novel) => {
+  // bookwalker also has no api, unfortunately
+  // it also has 40 pages! do you have any idea how long it takes to load all this?
+  const browser = await puppeteer.launch();
+
+  // todo: automate this
+  let currentPages = 40;
+  let newArray = [];
+
+  let baseUrl = 'https://global.bookwalker.jp/categories/2/?np=0&page=';
+  if (novel === true) {
+    baseUrl = 'https://global.bookwalker.jp/categories/3/?np=0&page=';
+    currentPages = 7;
+  }
+
+  const page = await browser.newPage();
+  for (let i = 0; i < currentPages; i++) {
+    await sleep();
+    await page.goto(baseUrl + (i + 1));
+
+    const list = await page.evaluate(() => {
+      const titles = document.querySelectorAll('.a-tile-ttl a');
+      let array = [];
+
+      titles.forEach(title => {
+        array.push({
+          title: title.innerText,
+          site: 'bookwalker',
+          url: title.href
+        });
+      });
+      return array;
+    });
+    newArray = newArray.concat(list);
+  }
+
+  await browser.close();
+
+  return newArray;
+};
diff --git a/crawler/src/crawlers/comicwalker.js b/crawler/src/crawlers/comicwalker.js
new file mode 100644
index 0000000..65464e3
--- /dev/null
+++ b/crawler/src/crawlers/comicwalker.js
@@ -0,0 +1,35 @@
+const esmImport = require('esm')(module);
+const { CookieJar, fetch } = esmImport('node-fetch-cookies');
+const cheerio = require('cheerio');
+const sleep = require('../util/sleep');
+const fs = require('fs');
+
+module.exports = async (ua) => {
+  if (!fs.existsSync('../comicwalker.json')) {
+    fs.writeFileSync('../comicwalker.json', '[]');
+  }
+  const cookieJar = new CookieJar();
+
+  await fetch(cookieJar, 'https://comic-walker.com/set_lang/en/', { headers: { 'user-agent': ua }});
+  const data = await (await fetch(cookieJar, 'https://comic-walker.com/', { headers: { 'user-agent': ua }})).text();
+  const $ = cheerio.load(data);
+  let categories = [];
+  $('#sideMagazineLabel a').each((i, el) => categories.push('https://comic-walker.com' + $(el).attr('href')));
+  await sleep();
+  for (const category of categories) {
+    await sleep();
+    const data2 = await (await fetch(cookieJar, category, { headers: { 'user-agent': ua }})).text();
+    const $$ = cheerio.load(data2);
+    let array = JSON.parse(fs.readFileSync('../comicwalker.json'));
+    $$('.tileList li').each((i, el) => {
+      array.push({
+        title: $$(el).children('a').children('h3').children('span').text(),
+        site: 'comicwalker',
+        url: 'https://comic-walker.com' + $$(el).children('a').attr('href'),
+      });
+    });
+    fs.writeFileSync('../comicwalker.json', JSON.stringify(array));
+  }
+
+  return fs.readFileSync('../comicwalker.json', 'utf8');
+};
diff --git a/crawler/src/crawlers/crunchyroll.js b/crawler/src/crawlers/crunchyroll.js
new file mode 100644
index 0000000..cbde818
--- /dev/null
+++ b/crawler/src/crawlers/crunchyroll.js
@@ -0,0 +1,30 @@
+const puppeteer = require('puppeteer-extra');
+const cheerio = require('cheerio');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async () => {
+  // crunchyroll uses cloudflare so we try to not look like a robot
+  const browser = await puppeteer.launch();
+  const page = await browser.newPage();
+  await page.goto('https://www.crunchyroll.com/comics/manga/alpha?group=all');
+  // todo: stop using cheerio
+  const $ = cheerio.load(await page.content());
+
+  let newArray = [];
+  $('.group-item').each((_i, title) => {
+    const text = $(title).text().replace(/\n/g, '').trim();
+    newArray.push({
+      title: text,
+      site: 'crunchyroll',
+      url: 'https://www.crunchyroll.com/comics/manga/' + text.toLowerCase() + '/volumes'
+    });
+  });
+
+  await browser.close();
+
+  return newArray;
+};
diff --git a/crawler/src/crawlers/inkr.js b/crawler/src/crawlers/inkr.js
new file mode 100644
index 0000000..c5f716f
--- /dev/null
+++ b/crawler/src/crawlers/inkr.js
@@ -0,0 +1,16 @@
+const fetch = require('node-fetch');
+
+module.exports = async (ua) => {
+  const object = await (await fetch('https://inkr.com/_next/data/_IeSY9KOuROEerSslZgux/genres/Manga.json', { headers: { 'user-agent': ua }})).json();
+  let newArray = [];
+  object.pageProps.collectionTitleDetails.forEach(category => {
+    category.titles.forEach(item => {
+      newArray.push({
+        title: item.name,
+        site: 'inkr',
+        url: 'https://inkr.com/' + item.oid
+      });
+    });
+  });
+  return newArray;
+};
diff --git a/crawler/src/crawlers/inky.js b/crawler/src/crawlers/inky.js
new file mode 100644
index 0000000..d471d25
--- /dev/null
+++ b/crawler/src/crawlers/inky.js
@@ -0,0 +1,25 @@
+const fetch = require('node-fetch');
+const sleep = require('../util/sleep.js');
+
+module.exports = async (ua) => {
+  // user agent is set so we don't look like a bot and get IP banned
+  // worst case scenario is the site blocks outside requests and that will suck
+  const data = await (await fetch('https://inky-pen.com/catalog/get?page=0&typeID=2', { headers: { 'user-agent': ua }})).json();
+  let count = 0;
+  let newArray = [];
+  await sleep();
+  for (let i = 0; i < data.totalPages; i++) {
+    await sleep();
+    const pageData = await (await fetch(`https://inky-pen.com/catalog/get?page=${count}&typeID=2`, { headers: { 'user-agent': ua }})).json();
+    const object = pageData.viewModels;
+    object.forEach(item => {
+      newArray.push({
+        title: item.title,
+        site: 'inky',
+        url: 'https://www.nintendo.com/games/detail/inkypen-switch/'
+      });
+    });
+    count++;
+  }
+  return newArray;
+};
diff --git a/crawler/src/crawlers/j-novel.js b/crawler/src/crawlers/j-novel.js
new file mode 100644
index 0000000..4421624
--- /dev/null
+++ b/crawler/src/crawlers/j-novel.js
@@ -0,0 +1,19 @@
+const fetch = require('node-fetch');
+
+module.exports = async (ua, novel) => {
+  let url = 'https://api.j-novel.club/api/mangaSeries';
+  if (novel === true) {
+    url = 'https://api.j-novel.club/api/series';
+  }
+
+  const object = await (await fetch(url, { headers: { 'user-agent': ua }})).json();
+  let newArray = [];
+  object.forEach(item => {
+    newArray.push({
+      title: item.title,
+      site: 'j-novel',
+      url: 'https://j-novel.club/series/' + item.titleslug
+    });
+  });
+  return newArray;
+};
diff --git a/crawler/src/crawlers/lezhinus.js b/crawler/src/crawlers/lezhinus.js
new file mode 100644
index 0000000..6903a6c
--- /dev/null
+++ b/crawler/src/crawlers/lezhinus.js
@@ -0,0 +1,48 @@
+const puppeteer = require('puppeteer-extra');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+const sleep = require('../util/sleep');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async () => {
+  const browser = await puppeteer.launch({
+    headless: false
+  });
+
+  // todo: automate this
+  let currentPages = 4;
+  let newArray = [];
+
+  const page = await browser.newPage();
+
+  for (let i = 0; i < currentPages; i++) {
+    await sleep();
+    await page.goto(`https://www.lezhinus.com/en/general?page=${i + 1}&sub_tags=all`);
+
+    const list = await page.evaluate(() => {
+      const items = document.querySelectorAll('.lzComic__item');
+      let array = [];
+
+      items.forEach(item => {
+        try {
+          array.push({
+            title: item.querySelector('.lzComic__title').innerText,
+            site: 'lezhinus',
+            url: item.querySelector('.lzComic__link').href
+          });
+        } catch (e) {
+          // do nothing
+        }
+      });
+      return array;
+    });
+    console.log(list);
+    newArray = newArray.concat(list);
+  }
+
+  await browser.close();
+
+  return newArray;
+};
diff --git a/crawler/src/crawlers/mangaplanet.js b/crawler/src/crawlers/mangaplanet.js
new file mode 100644
index 0000000..be3322e
--- /dev/null
+++ b/crawler/src/crawlers/mangaplanet.js
@@ -0,0 +1,42 @@
+const puppeteer = require('puppeteer-extra');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+const sleep = require('../util/sleep');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async () => {
+  // manga planet has no api either, unfortunately
+  const browser = await puppeteer.launch();
+
+  // todo: automate this
+  let currentPages = 4;
+  let newArray = [];
+
+  const page = await browser.newPage();
+
+  for (let i = 0; i < currentPages; i++) {
+    await sleep();
+    await page.goto('https://read.mangaplanet.com/browse?page=' + (i + 1));
+
+    const list = await page.evaluate(() => {
+      const divs = document.querySelectorAll('.linkbox');
+      let array = [];
+
+      divs.forEach(div => {
+        array.push({
+          title: div.querySelector('h3').innerText,
+          site: 'mangaplanet',
+          url: div.querySelector('a').href
+        });
+      });
+      return array;
+    });
+    newArray = newArray.concat(list);
+  }
+
+  await browser.close();
+
+  return newArray;
+};
diff --git a/crawler/src/crawlers/mangaplus.js b/crawler/src/crawlers/mangaplus.js
new file mode 100644
index 0000000..d103adb
--- /dev/null
+++ b/crawler/src/crawlers/mangaplus.js
@@ -0,0 +1,41 @@
+const puppeteer = require('puppeteer-extra');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+const scrollPageToBottom = require('puppeteer-autoscroll-down');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async () => {
+  // until i can figure out how their api works, we will have to make do with puppeteer
+  // slower, but works
+  const browser = await puppeteer.launch();
+  const page = await browser.newPage();
+  await page.goto('https://mangaplus.shueisha.co.jp/manga_list/all');
+
+  await page.waitForSelector('.AllTitle-module_image_JIEI9', {
+    visible: true
+  });
+
+  await scrollPageToBottom(page);
+
+  const list = await page.evaluate(() => {
+    let newArray = [];
+    // this will probably need changing often
+    const divs = document.querySelectorAll('.AllTitle-module_allTitle_1CIUC');
+
+    divs.forEach(div => {
+      newArray.push({
+        title: div.querySelector('.AllTitle-module_title_20PzS').innerText,
+        site: 'mangaplus',
+        url: 'https://mangaplus.shueisha.co.jp/titles/' + div.querySelector('.AllTitle-module_image_JIEI9').src.split('/title/')[1].split('/')[0].toLowerCase().replaceAll(' ', '-'),
+      });
+    });
+
+    return newArray;
+  });
+
+  await browser.close();
+
+  return list;
+};
diff --git a/crawler/src/crawlers/myanimelist.js b/crawler/src/crawlers/myanimelist.js
new file mode 100644
index 0000000..61ffed0
--- /dev/null
+++ b/crawler/src/crawlers/myanimelist.js
@@ -0,0 +1,48 @@
+const puppeteer = require('puppeteer-extra');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+const sleep = require('../util/sleep');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async () => {
+  // the MAL api is garbage/non-existent
+  const browser = await puppeteer.launch({
+    headless: false
+  });
+
+  // todo: automate this
+  let currentPages = 10;
+  let newArray = [];
+
+  const page = await browser.newPage();
+
+  for (let i = 0; i < currentPages; i++) {
+    await sleep();
+    await page.goto('https://myanimelist.net/store/search?keyword=&p=' + (i + 1));
+
+    const list = await page.evaluate(() => {
+      const items = document.querySelectorAll('.item');
+      let array = [];
+
+      items.forEach(item => {
+        try {
+          array.push({
+            title: item.querySelector('.title').innerText,
+            site: 'myanimelist',
+            url: item.href
+          });
+        } catch (e) {
+          // do nothing
+        }
+      });
+      return array;
+    });
+    newArray = newArray.concat(list);
+  }
+
+  await browser.close();
+
+  return newArray;
+};
diff --git a/crawler/src/crawlers/netcomics.js b/crawler/src/crawlers/netcomics.js
new file mode 100644
index 0000000..1adeda0
--- /dev/null
+++ b/crawler/src/crawlers/netcomics.js
@@ -0,0 +1,14 @@
+const fetch = require('node-fetch');
+
+module.exports = async (ua) => {
+  const object = await (await fetch('https://beta-api.netcomics.com/api/v1/title/search/text?no=1&size=500000&text=', { headers: { 'user-agent': ua }})).json();
+  let newArray = [];
+  object.data.forEach(item => {
+    newArray.push({
+      title: item.title_name,
+      site: 'netcomics',
+      url: 'https://www.netcomics.com/us/comic/' + item.title_name.toLowerCase().replace(/[^a-z0-9 ]/g, '').replaceAll(' ', '-')
+    });
+  });
+  return newArray;
+};
diff --git a/crawler/src/crawlers/renta.js b/crawler/src/crawlers/renta.js
new file mode 100644
index 0000000..e5b12b9
--- /dev/null
+++ b/crawler/src/crawlers/renta.js
@@ -0,0 +1,51 @@
+const puppeteer = require('puppeteer-extra');
+const StealthPlugin = require('puppeteer-extra-plugin-stealth');
+const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
+const sleep = require('../util/sleep');
+
+puppeteer.use(StealthPlugin());
+puppeteer.use(BlockResourcesPlugin({ blockedTypes: new Set(['image', 'stylesheet', 'script', 'font']) }));
+
+module.exports = async () => {
+  // renta has no api, unfortunately
+  const browser = await puppeteer.launch();
+
+  // todo: automate this
+  let categories = [{
+    name: 'Shonen_Manga',
+    pages: 3
+  },{
+    name: 'Shojo_Manga',
+    pages: 7
+  }];
+
+  let newArray = [];
+
+  const page = await browser.newPage();
+
+  for (const category of categories) {
+    for (let i = 0; i < category.pages; i++) {
+      await sleep();
+      await page.goto(`https://www.ebookrenta.com/renta/sc/frm/list?rsi=c&genm=${category.name}&page=${i + 1}&type=desc&col=info`);
+
+      const list = await page.evaluate(() => {
+        const titles = document.querySelectorAll('.headlines > a');
+        let array = [];
+
+        titles.forEach(title => {
+          array.push({
+            title: title.innerText,
+            site: 'renta',
+            url: title.href
+          });
+        });
+        return array;
+      });
+      newArray = newArray.concat(list);
+    }
+  }
+
+  await browser.close();
+
+  return newArray;
+};
diff --git a/crawler/src/crawlers/toomics.js b/crawler/src/crawlers/toomics.js
new file mode 100644
index 0000000..b31d525
--- /dev/null
+++ b/crawler/src/crawlers/toomics.js
@@ -0,0 +1,16 @@
+const fetch = require('node-fetch');
+const cheerio = require('cheerio');
+
+module.exports = async (ua) => {
+  const data = await (await fetch('https://toomics.com/en/webtoon/ranking', { headers: { 'user-agent': ua }})).text();
+  const $ = cheerio.load(data);
+  const newArray = [];
+  $('li > .visual').each((_i, title) => {
+    newArray.push({
+      title: $(title).find('.title').text(),
+      site: 'toomics',
+      url: 'https://toomics.com' + $(title).find('a').attr('href')
+    });
+  });
+  return newArray;
+};
diff --git a/crawler/src/crawlers/viz.js b/crawler/src/crawlers/viz.js
new file mode 100644
index 0000000..111122a
--- /dev/null
+++ b/crawler/src/crawlers/viz.js
@@ -0,0 +1,23 @@
+const fetch = require('node-fetch');
+
+module.exports = async (ua) => {
+  const data = await (await fetch('https://www.viz.com/search/series_titles.js', { headers: { 'user-agent': ua }})).text();
+  // site uses weird code
+  const object = JSON.parse(data.split('suggestions = ')[1].replaceAll(';', ''));
+  let newArray = [];
+  object.forEach(item => {
+    const slug = item.title.toLowerCase().replaceAll(' ', '-');
+    let url = 'https://www.viz.com/' + slug;
+    // viz site sucks so we have to manually get the url for some things
+    if (!item.title.replaceAll(' ', '').match(/^[0-9a-z]+$/)) {
+      url = '/viz/' + slug;
+    }
+
+    newArray.push({
+      title: item.title,
+      site: 'viz',
+      url: url
+    });
+  });
+  return newArray;
+};
diff --git a/crawler/src/crawlinterval.txt b/crawler/src/crawlinterval.txt
new file mode 100644
index 0000000..a6802d4
--- /dev/null
+++ b/crawler/src/crawlinterval.txt
@@ -0,0 +1 @@
+1628607072198
\ No newline at end of file
diff --git a/crawler/src/index.js b/crawler/src/index.js
new file mode 100644
index 0000000..4050a68
--- /dev/null
+++ b/crawler/src/index.js
@@ -0,0 +1,31 @@
+const Interval = require('./util/interval.js');
+const crawl = require('./util/crawl.js');
+const fs = require('fs');
+
+if (!fs.existsSync('./data.json')) {
+  fs.writeFileSync('./data.json', '[]');
+  console.info('Created data.json file');
+}
+
+let data = require('./data.json');
+if (data.length === 0) {
+  console.info('Crawl started...');
+  crawl(console).then(() => {
+    // refresh data
+    delete require.cache[require.resolve('./data.json')];
+    data = require('./data.json');
+    console.info('Crawl finished!');
+  });
+}
+
+// in theory, this should get new data every week.
+// chances are, this will break horribly and the whole thing will fail
+Interval(() => {
+  console.info('Crawl started...');
+  crawl(console).then(() => {
+    // refresh data
+    delete require.cache[require.resolve('./data.json')];
+    data = require('./data.json');
+    console.info('Crawl finished!');
+  });
+}, 604800000, 'crawl');
\ No newline at end of file
diff --git a/crawler/src/test.js b/crawler/src/test.js
new file mode 100644
index 0000000..828eda1
--- /dev/null
+++ b/crawler/src/test.js
@@ -0,0 +1,7 @@
+const myanimelist = require('./crawlers/myanimelist.js');
+const fs = require('fs');
+
+myanimelist().then(results => {
+  console.log(results);
+  fs.writeFileSync('./myanimelist.json', JSON.stringify(results));
+});
\ No newline at end of file
diff --git a/crawler/src/util/crawl.js b/crawler/src/util/crawl.js
new file mode 100644
index 0000000..229439f
--- /dev/null
+++ b/crawler/src/util/crawl.js
@@ -0,0 +1,36 @@
+const fs = require('fs');
+
+const viz = require('../crawlers/viz.js');
+const inky = require('../crawlers/inky.js');
+const netcomics = require('../crawlers/netcomics.js');
+const crunchy = require('../crawlers/crunchyroll.js');
+const mangaplus = require('../crawlers/mangaplus.js');
+const azuki = require('../crawlers/azuki.js');
+const bookwalker = require('../crawlers/bookwalker.js');
+
+module.exports = async (log) => {
+  let newArray = [];
+  // this is my personal edge user agent
+  const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62';
+  // node-fetch
+  newArray = newArray.concat(await viz(ua));
+  log.info('Crawl Viz done');
+  newArray = newArray.concat(await netcomics(ua));
+  log.info('Crawl Netcomics done');
+  newArray = newArray.concat(await inky(ua));
+  log.info('Crawl Inky done');
+
+  // puppeteer
+  newArray = newArray.concat(await crunchy());
+  log.info('Crawl Crunchyroll done');
+  newArray = newArray.concat(await mangaplus());
+  log.info('Crawl MangaPlus done');
+  newArray = newArray.concat(await azuki());
+  log.info('Crawl Azuki done');
+  newArray = newArray.concat(await bookwalker());
+  log.info('Crawl Bookwalker done');
+
+  fs.writeFileSync('./data.json', JSON.stringify(newArray));
+  log.info('Crawl write file done');
+  return 'Success';
+};
diff --git a/crawler/src/util/crawlLightNovels.js b/crawler/src/util/crawlLightNovels.js
new file mode 100644
index 0000000..c4acef0
--- /dev/null
+++ b/crawler/src/util/crawlLightNovels.js
@@ -0,0 +1,22 @@
+const fs = require('fs');
+
+const jNovel = require('../crawlers/j-novel.js');
+const bookwalker = require('../crawlers/bookwalker.js');
+
+const doStuff = async () => {
+  let newArray = [];
+  // this is my personal edge user agent
+  const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62';
+  // node-fetch
+  newArray = newArray.concat(await jNovel(ua, true));
+  console.info('Crawl J-Novel done');
+
+  // puppeteer
+  newArray = newArray.concat(await bookwalker(true));
+  console.info('Crawl BOOKWALKER done');
+
+  fs.writeFileSync('./dataTest.json', JSON.stringify(newArray));
+  console.info('Crawl write file done');
+};
+
+doStuff();
diff --git a/crawler/src/util/crawlTest.js b/crawler/src/util/crawlTest.js
new file mode 100644
index 0000000..2a0dbf9
--- /dev/null
+++ b/crawler/src/util/crawlTest.js
@@ -0,0 +1,17 @@
+const fs = require('fs');
+
+const inkr = require('../crawlers/inkr.js');
+
+const doStuff = async () => {
+  let newArray = [];
+  // this is my personal edge user agent
+  const ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62';
+  // node-fetch
+  newArray = newArray.concat(await inkr(ua));
+  console.info('Crawl INKR done');
+
+  fs.writeFileSync('./dataTest.json', JSON.stringify(newArray));
+  console.info('Crawl write file done');
+};
+
+doStuff();
diff --git a/crawler/src/util/interval.js b/crawler/src/util/interval.js
new file mode 100644
index 0000000..f24f9b7
--- /dev/null
+++ b/crawler/src/util/interval.js
@@ -0,0 +1,30 @@
+// localStorage -> fs because lazy and I want to reuse Mue code
+const localStorage = require('./localStorage.js');
+
+// based on https://stackoverflow.com/a/47009962
+module.exports = function(callback, interval, name) {
+  const key = name + 'interval';
+  const timeInMs = localStorage.getItem(key);
+
+  const now = Date.now();
+
+  const executeCallback = () => {
+    localStorage.setItem(key, Date.now());
+    callback();
+  };
+
+  if (timeInMs) {
+    const delta = now - parseInt(timeInMs);
+    if (delta >= interval) {
+      // overdue: run immediately, then keep running on the usual interval
+      executeCallback();
+      setInterval(executeCallback, interval);
+    } else {
+      // run once the remaining time has elapsed, then keep running on the usual interval
+      setTimeout(() => {
+        executeCallback();
+        setInterval(executeCallback, interval);
+      }, interval - delta);
+    }
+  } else {
+    localStorage.setItem(key, now);
+    setInterval(executeCallback, interval);
+  }
+};
diff --git a/crawler/src/util/localStorage.js b/crawler/src/util/localStorage.js
new file mode 100644
index 0000000..49b9c3d
--- /dev/null
+++ b/crawler/src/util/localStorage.js
@@ -0,0 +1,14 @@
+const fs = require('fs');
+
+module.exports = class LocalStorage {
+  static getItem(key) {
+    if (!fs.existsSync(`./${key}.txt`)) {
+      return null;
+    }
+    return fs.readFileSync(`./${key}.txt`, 'utf8');
+  }
+
+  static setItem(key, value) {
+    fs.writeFileSync(`./${key}.txt`, String(value));
+  }
+};
diff --git a/crawler/src/util/sleep.js b/crawler/src/util/sleep.js
new file mode 100644
index 0000000..a556909
--- /dev/null
+++ b/crawler/src/util/sleep.js
@@ -0,0 +1,4 @@
+module.exports = () => {
+  // random delay between 3 and 6 seconds so requests look less like a bot
+  return new Promise(resolve => setTimeout(resolve, Math.floor(Math.random() * (6000 - 3000 + 1) + 3000)));
+};
\ No newline at end of file
diff --git a/package.json b/frontend/package.json
similarity index 100%
rename from package.json
rename to frontend/package.json
diff --git a/public/index.html b/frontend/public/index.html
similarity index 100%
rename from public/index.html
rename to frontend/public/index.html
diff --git a/public/manifest.json b/frontend/public/manifest.json
similarity index 100%
rename from public/manifest.json
rename to frontend/public/manifest.json
diff --git a/src/App.jsx b/frontend/src/App.jsx
similarity index 100%
rename from src/App.jsx
rename to frontend/src/App.jsx
diff --git a/src/components/Footer.jsx b/frontend/src/components/Footer.jsx
similarity index 100%
rename from src/components/Footer.jsx
rename to frontend/src/components/Footer.jsx
diff --git a/src/components/Navbar.jsx b/frontend/src/components/Navbar.jsx
similarity index 100%
rename from src/components/Navbar.jsx
rename to frontend/src/components/Navbar.jsx
diff --git a/src/components/Random.jsx b/frontend/src/components/Random.jsx
similarity index 100%
rename from src/components/Random.jsx
rename to frontend/src/components/Random.jsx
diff --git a/src/components/Results.jsx b/frontend/src/components/Results.jsx
similarity index 100%
rename from src/components/Results.jsx
rename to frontend/src/components/Results.jsx
diff --git a/src/index.js b/frontend/src/index.js
similarity index 100%
rename from src/index.js
rename to frontend/src/index.js
diff --git a/src/modules/constants.js b/frontend/src/modules/constants.js
similarity index 100%
rename from src/modules/constants.js
rename to frontend/src/modules/constants.js
diff --git a/src/modules/placeholders.js b/frontend/src/modules/placeholders.js
similarity index 100%
rename from src/modules/placeholders.js
rename to frontend/src/modules/placeholders.js
diff --git a/src/scss/index.scss b/frontend/src/scss/index.scss
similarity index 100%
rename from src/scss/index.scss
rename to frontend/src/scss/index.scss
diff --git a/src/scss/modules/_cards.scss b/frontend/src/scss/modules/_cards.scss
similarity index 100%
rename from src/scss/modules/_cards.scss
rename to frontend/src/scss/modules/_cards.scss
diff --git a/src/scss/modules/_dark.scss b/frontend/src/scss/modules/_dark.scss
similarity index 100%
rename from src/scss/modules/_dark.scss
rename to frontend/src/scss/modules/_dark.scss
diff --git a/src/scss/modules/_dropdown.scss b/frontend/src/scss/modules/_dropdown.scss
similarity index 100%
rename from src/scss/modules/_dropdown.scss
rename to frontend/src/scss/modules/_dropdown.scss
diff --git a/src/scss/modules/_navbar.scss b/frontend/src/scss/modules/_navbar.scss
similarity index 100%
rename from src/scss/modules/_navbar.scss
rename to frontend/src/scss/modules/_navbar.scss
diff --git a/src/service-worker.js b/frontend/src/service-worker.js
similarity index 100%
rename from src/service-worker.js
rename to frontend/src/service-worker.js
diff --git a/src/serviceWorkerRegistration.js b/frontend/src/serviceWorkerRegistration.js
similarity index 100%
rename from src/serviceWorkerRegistration.js
rename to frontend/src/serviceWorkerRegistration.js
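
Appendix (not part of the diff above): a minimal sketch of how a client could call the backend once the vercel.json rewrites are deployed. The deployment URL is a placeholder, the `input` and `type` query parameters mirror backend/api/search.js, and node-fetch is assumed as in the backend's own dependencies.

const fetch = require('node-fetch');

// placeholder deployment URL, not a real endpoint
const BASE_URL = 'https://mangapages-backend.example.vercel.app';

// query the /search rewrite (-> /api/search); anything other than
// lightnovel, manhwa or manhua falls through to the manga dataset,
// matching the switch statement in backend/api/search.js
async function search(input, type = 'manga') {
  const res = await fetch(`${BASE_URL}/search?input=${encodeURIComponent(input)}&type=${type}`);
  if (res.status === 429) {
    throw new Error('Rate limited, try again in a minute');
  }
  return res.json();
}

search('one piece').then((results) => console.log(results));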