make crawler opensource and convert to monorepo
davidcralph committed Jan 7, 2023
1 parent 767ac13 commit 616b43b
Showing 56 changed files with 902 additions and 3 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,9 +1,13 @@
# Folders
build/
.vercel/
.vscode/
.history/
node_modules/

# Files
yarn.lock
data.json
dataTest.json
pnpm-lock.yaml
package-lock.json
4 changes: 1 addition & 3 deletions README.md
@@ -4,9 +4,7 @@
As of the latest update, support for manhwa and manhua has been added. You can choose between manga, light novels, manhwa or manhua from the dropdown on the main page.

## About
I built Manga Pages to solve the problem of finding which legal service offers a given manga or light novel. Search engines don't always show this information (and often surface ad-filled unofficial sites instead). The site is a simple hub for finding what to read next: it shows which sites carry the manga/light novel and includes a MyAnimeList button for more information about it. Think of this service as [because.moe](https://because.moe), but for manga and light novels. The source code for the backend can be found [here](https://github.com/davidcralph/mangapages-backend).

I built Manga Pages to solve the problem of finding which legal service offers a given manga or light novel. Search engines don't always show this information (and often surface ad-filled unofficial sites instead). The site is a simple hub for finding what to read next: it shows which sites carry the manga/light novel and includes a MyAnimeList button for more information about it. Think of this service as [because.moe](https://because.moe), but for manga and light novels.
## Features
🔍 Fast search with support for [multiple websites](#supported-websites)

15 changes: 15 additions & 0 deletions backend/api/404.js
@@ -0,0 +1,15 @@
const rateLimit = require('../struct/ratelimiter');

module.exports = async (req, res) => {
  try {
    await rateLimit(30, req.headers['x-real-ip']);
  } catch (error) {
    return res.status(429).send({
      message: 'Too many requests'
    });
  }

  return res.status(404).send({
    message: 'Not found'
  });
};
14 changes: 14 additions & 0 deletions backend/api/index.js
@@ -0,0 +1,14 @@
const config = require('../config.json');
const rateLimit = require('../struct/ratelimiter');

module.exports = async (req, res) => {
  try {
    await rateLimit(30, req.headers['x-real-ip']);
  } catch (error) {
    return res.status(429).send({
      message: 'Too many requests'
    });
  }

  return res.status(200).send(config.helloworld);
};
26 changes: 26 additions & 0 deletions backend/api/mal.js
@@ -0,0 +1,26 @@
const fetch = require('node-fetch');
const rateLimit = require('../struct/ratelimiter');

module.exports = async (req, res) => {
  try {
    await rateLimit(50, req.headers['x-real-ip']);
  } catch (error) {
    return res.status(429).send({
      message: 'Too many requests',
    });
  }

  if (!req.query.slug) {
    // a missing parameter is a client error, not an auth failure
    return res.status(400).send({
      message: 'Input param required',
    });
  }

  const data = await (
    await fetch(
      `https://myanimelist.net/search/prefix.json?type=manga&keyword=${encodeURIComponent(req.query.slug)}`
    )
  ).json();

  // guard against searches that return no results
  const match = data.categories?.[0]?.items?.[0];
  if (!match) {
    return res.status(404).send({
      message: 'Not found',
    });
  }

  return res.redirect(match.url, 302);
};
33 changes: 33 additions & 0 deletions backend/api/random.js
@@ -0,0 +1,33 @@
const data = require('../data.json');
const dataLightNovel = require('../dataLightNovel.json');
const dataManhwa = require('../dataManhwa.json');
const dataManhua = require('../dataManhua.json');
const rateLimit = require('../struct/ratelimiter');

module.exports = async (req, res) => {
  try {
    await rateLimit(100, req.headers['x-real-ip']);
  } catch (error) {
    return res.status(429).send({
      message: 'Too many requests',
    });
  }

  let use = data;
  switch (req.query.type) {
    case 'lightnovel':
      use = dataLightNovel;
      break;
    case 'manhwa':
      use = dataManhwa;
      break;
    case 'manhua':
      use = dataManhua;
      break;
    default:
      break;
  }

  res.setHeader('Access-Control-Allow-Origin', '*');
  // sort mutates in place, so shuffle a copy to keep the module-cached array intact
  return res.status(200).send([...use].sort(() => Math.random() - 0.5).slice(-4));
};
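A note on the shuffle above: `sort(() => Math.random() - 0.5)` is biased, since comparison sorts don't produce uniformly random orderings. If uniformity ever matters for the random picks, a Fisher-Yates sketch (not part of this commit):

// unbiased shuffle: swap each slot with a uniformly chosen earlier-or-equal slot
const shuffle = (items) => {
  const copy = [...items];
  for (let i = copy.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [copy[i], copy[j]] = [copy[j], copy[i]];
  }
  return copy;
};

// usage: shuffle(use).slice(0, 4)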
39 changes: 39 additions & 0 deletions backend/api/search.js
@@ -0,0 +1,39 @@
const data = require('../data.json');
const dataLightNovel = require('../dataLightNovel.json');
const dataManhwa = require('../dataManhwa.json');
const dataManhua = require('../dataManhua.json');
const { matchSorter } = require('match-sorter');
const rateLimit = require('../struct/ratelimiter');

module.exports = async (req, res) => {
  try {
    await rateLimit(500, req.headers['x-real-ip']);
  } catch (error) {
    return res.status(429).send({
      message: 'Too many requests'
    });
  }

  // without an input query there is nothing to rank, and a plain object
  // would crash the .slice call below
  if (!req.query.input) {
    return res.status(400).send({
      message: 'Input query required'
    });
  }

  let use = data;
  switch (req.query.type) {
    case 'lightnovel':
      use = dataLightNovel;
      break;
    case 'manhwa':
      use = dataManhwa;
      break;
    case 'manhua':
      use = dataManhua;
      break;
    default:
      break;
  }

  const mangaResults = matchSorter(use, req.query.input, {
    keys: ['title', 'site'],
    threshold: matchSorter.rankings.WORD_STARTS_WITH
  });

  res.setHeader('Access-Control-Allow-Origin', '*');
  return res.status(200).send(mangaResults.slice(0, 300));
};
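For context on the `threshold` option: `WORD_STARTS_WITH` keeps an entry when any word in a keyed field starts with the query. A quick illustration with invented sample data:

const { matchSorter } = require('match-sorter');

const sample = [
  { title: 'One Piece', site: 'example' },
  { title: 'One Punch Man', site: 'example' }
];

matchSorter(sample, 'pun', {
  keys: ['title', 'site'],
  threshold: matchSorter.rankings.WORD_STARTS_WITH
});
// returns [{ title: 'One Punch Man', site: 'example' }], since 'Punch' starts with 'pun'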
7 changes: 7 additions & 0 deletions backend/config.json
@@ -0,0 +1,7 @@
{
  "helloworld": {
    "version": "1.0.0",
    "message": "Hello World!"
  },
  "ratelimit_time": 60
}
7 changes: 7 additions & 0 deletions backend/package.json
@@ -0,0 +1,7 @@
{
  "dependencies": {
    "lambda-rate-limiter": "^3.0.1",
    "match-sorter": "^6.3.1",
    "node-fetch": "^2.6.7"
  }
}
6 changes: 6 additions & 0 deletions backend/struct/ratelimiter.js
@@ -0,0 +1,6 @@
const config = require('../config.json');
const ratelimit = require('lambda-rate-limiter');

module.exports = ratelimit({
  // config.json defines `ratelimit_time` in seconds, not a nested `ratelimit.time`
  interval: config.ratelimit_time * 1000
}).check;
24 changes: 24 additions & 0 deletions backend/vercel.json
@@ -0,0 +1,24 @@
{
  "rewrites": [
    {
      "source": "/",
      "destination": "/api/index"
    },
    {
      "source": "/random",
      "destination": "/api/random"
    },
    {
      "source": "/search",
      "destination": "/api/search"
    },
    {
      "source": "/mal/:slug",
      "destination": "/api/mal?slug=:slug"
    },
    {
      "source": "/(.*)",
      "destination": "/api/404"
    }
  ]
}
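These rewrites map clean public paths onto the `api/` functions, so a request to `/mal/:slug` reaches `api/mal.js` with the slug in `req.query.slug`. A hypothetical client call (the hostname is an assumption, not the real deployment):

const fetch = require('node-fetch');

// /mal/berserk is rewritten to /api/mal?slug=berserk, which answers with a
// 302 redirect; 'manual' stops node-fetch following it so the Location header stays visible
fetch('https://mangapages-backend.example.vercel.app/mal/berserk', { redirect: 'manual' })
  .then((res) => console.log(res.status, res.headers.get('location')));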
15 changes: 15 additions & 0 deletions crawler/package.json
@@ -0,0 +1,15 @@
{
  "scripts": {
    "start": "cd src && node index.js"
  },
  "dependencies": {
    "cheerio": "^1.0.0-rc.10",
    "esm": "^3.2.25",
    "node-fetch-cookies": "^2.0.3",
    "puppeteer": "^10.2.0",
    "puppeteer-autoscroll-down": "^0.1.7",
    "puppeteer-extra": "^3.1.18",
    "puppeteer-extra-plugin-block-resources": "^2.2.9",
    "puppeteer-extra-plugin-stealth": "^2.7.8"
  }
}
11 changes: 11 additions & 0 deletions crawler/src/config.json
@@ -0,0 +1,11 @@
{
  "port": 80,
  "ratelimit": {
    "max": 500,
    "per": "1 minute"
  },
  "log": {
    "level": "info",
    "prettyPrint": true
  }
}
42 changes: 42 additions & 0 deletions crawler/src/crawlers/azuki.js
@@ -0,0 +1,42 @@
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
const sleep = require('../util/sleep');

puppeteer.use(StealthPlugin());
puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font'])));

module.exports = async () => {
  // azuki has no api, unfortunately
  const browser = await puppeteer.launch();

  // todo: automate this
  let currentPages = 4;
  let newArray = [];

  const page = await browser.newPage();

  for (let i = 0; i < currentPages; i++) {
    await sleep();
    await page.goto('https://www.azuki.co/series/' + (i + 1));

    const list = await page.evaluate(() => {
      const titles = document.querySelectorAll('.a-card-link');
      let array = [];

      titles.forEach(title => {
        // strip special characters before swapping spaces for hyphens,
        // otherwise the character filter removes the hyphens it just inserted
        const slug = title.innerText.toLowerCase().replace(/[^a-z ]/g, '').replaceAll(' ', '-');
        array.push({
          title: title.innerText,
          site: 'azuki',
          url: 'https://www.azuki.co/series/' + slug
        });
      });
      return array;
    });
    newArray = newArray.concat(list);
  }

  await browser.close();

  return newArray;
};
40 changes: 40 additions & 0 deletions crawler/src/crawlers/bilibili.js
@@ -0,0 +1,40 @@
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
const scrollPageToBottom = require('puppeteer-autoscroll-down');
const sleep = require('../util/sleep');

puppeteer.use(StealthPlugin());
puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font'])));

module.exports = async () => {
  // until i can figure out how their api works, we'll have to make do with puppeteer
  // slower, but works
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.bilibilicomics.com/genre');

  await scrollPageToBottom(page);
  await sleep();
  await scrollPageToBottom(page);

  const list = await page.evaluate(() => {
    let newArray = [];
    // this will probably need changing often
    const divs = document.querySelectorAll('.text-info-section');

    divs.forEach(div => {
      newArray.push({
        title: div.querySelector('.manga-title').innerText,
        site: 'bilibili',
        url: div.querySelector('a').href,
      });
    });

    return newArray;
  });

  await browser.close();

  return list;
};
48 changes: 48 additions & 0 deletions crawler/src/crawlers/bookwalker.js
@@ -0,0 +1,48 @@
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const BlockResourcesPlugin = require('puppeteer-extra-plugin-block-resources');
const sleep = require('../util/sleep');

puppeteer.use(StealthPlugin());
puppeteer.use(BlockResourcesPlugin(new Set(['image', 'stylesheet', 'script', 'font'])));

module.exports = async (novel) => {
  // bookwalker also has no api, unfortunately
  // it also has 40 pages! do you have any idea how long it takes to load all this?
  const browser = await puppeteer.launch();

  // todo: automate this
  let currentPages = 40;
  let newArray = [];

  let baseUrl = 'https://global.bookwalker.jp/categories/2/?np=0&page=';
  if (novel === true) {
    baseUrl = 'https://global.bookwalker.jp/categories/3/?np=0&page=';
    currentPages = 7;
  }

  const page = await browser.newPage();
  for (let i = 0; i < currentPages; i++) {
    await sleep();
    await page.goto(baseUrl + (i + 1));

    const list = await page.evaluate(() => {
      const titles = document.querySelectorAll('.a-tile-ttl a');
      let array = [];

      titles.forEach(title => {
        array.push({
          title: title.innerText,
          site: 'bookwalker',
          url: title.href
        });
      });
      return array;
    });
    newArray = newArray.concat(list);
  }

  await browser.close();

  return newArray;
};
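Every crawler above requires `../util/sleep`, which sits outside the files shown in this part of the diff. A plausible minimal version (an assumption, since the actual file isn't visible here):

// crawler/src/util/sleep.js (hypothetical): resolve after a fixed delay so page
// loads settle and the crawlers don't hammer the target sites
module.exports = (ms = 2000) => new Promise((resolve) => setTimeout(resolve, ms));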