-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawl.js
72 lines (59 loc) · 1.94 KB
/
crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
const Crawler = require("crawler");
const Seenreq = require('seenreq');
// Base URL of the site to crawl, read from CRAWL_PAGE. Trailing slashes are
// stripped because the rest of the script always appends "/..." to PAGE; a
// value such as "https://example.org/" would otherwise yield "//" URLs.
const PAGE = (process.env.CRAWL_PAGE || "").replace(/\/+$/, "");
if (!PAGE) {
  // Without a base URL the script would request the literal "undefined/" and
  // still exit successfully, so fail loudly instead.
  throw new Error("CRAWL_PAGE environment variable must be set to the base URL of the site to crawl");
}
// Selector (read from CRAWL_LINKSELECTOR) used to pick the link elements to follow.
const LINKSELECTOR = process.env.CRAWL_LINKSELECTOR;
// Registry consulted to avoid queueing the same URL twice.
const seen = new Seenreq();
// Running count of handled responses; only used for the progress log line.
let crawledTotal = 0;
// Substrings that disqualify a link: non-HTTP schemes, query-string URLs and
// the "typo3/" and "fileadmin/" paths.
const SKIPPED_LINK_MARKERS = [
  "javascript:",
  "mailto:",
  "file:",
  "tel:",
  "fax:",
  "?",
  "typo3/",
  "fileadmin/",
];

/**
 * Turns a raw href into a site-relative path ("/foo/bar"), or null when the
 * link must not be crawled.
 *
 * @param {string|undefined} linkUrl - href attribute value of a link element.
 * @returns {string|null} Path starting with "/", without fragment, or null.
 */
function toSitePath(linkUrl) {
  if (!linkUrl || !linkUrl.length) {
    return null;
  }
  // Links containing "http" are only accepted when they point at PAGE itself.
  if (linkUrl.includes("http") && !linkUrl.includes(PAGE)) {
    return null;
  }
  if (SKIPPED_LINK_MARKERS.some((marker) => linkUrl.includes(marker))) {
    return null;
  }

  let path = linkUrl.replace(PAGE, "");
  const hashAt = path.indexOf("#");
  if (hashAt > -1) {
    // Drop the fragment: it addresses the same document.
    path = path.substring(0, hashAt);
  }
  if (path.charAt(0) !== "/") {
    path = `/${path}`;
  }
  // A bare "/" is the start page, which is queued separately at startup.
  return path.length > 1 ? path : null;
}

/**
 * Crawler callback: collects the not-yet-seen same-site links of a fetched
 * page, queues them on the crawler `c`, logs progress and releases the slot.
 *
 * @param {Error|null} error - Request error, if any; it is logged and the page skipped.
 * @param {object} res - Crawler response; `res.$` is used as the document query
 *   function when present and `res.options.uri` as the requested URL.
 * @param {Function} done - Must be called exactly once to signal completion.
 */
const handleCrawlResult = function handleCrawlResult(error, res, done) {
  const urls = [];
  if (error) {
    console.log(error);
  } else {
    const $ = res.$;
    // `$` is only a function when a queryable document is attached to the response.
    const hasDocument = typeof $ === "function";
    if (hasDocument) {
      $(LINKSELECTOR).each((index, link) => {
        const relLinkUrl = toSitePath($(link).attr("href"));
        // NOTE(review): relies on seen.exists() returning a boolean synchronously;
        // confirm this holds for the installed seenreq version.
        if (relLinkUrl !== null && !seen.exists(PAGE + relLinkUrl)) {
          urls.push(PAGE + relLinkUrl);
        }
      });
    }
    // Reading the title used to happen unconditionally and threw a TypeError
    // for responses without a document, so `done()` was never reached.
    const title = hasDocument ? $("title").text() : "(no HTML document)";
    // Post-increment: the logged total is the count before this page.
    console.log(`[RM Cache Warmer] Scraped: ${res.options.uri.replace(PAGE, "")} - ${title} (${urls.length} urls queued, queue size ${c.queueSize}, total crawled ${crawledTotal++})`);
  }
  if (urls.length) {
    c.queue(urls);
  }
  done();
};
// Crawler configuration: every fetched page is handed to handleCrawlResult.
// rateLimit is presumably the minimum delay between requests in milliseconds
// (see the crawler package documentation).
const options = {
  rateLimit: 300,
  callback: handleCrawlResult
};

// Attach basic-auth credentials only when a user is configured. Passing
// `auth` with an undefined user makes unprotected sites impossible to crawl,
// because the HTTP layer is handed an invalid credential pair.
if (typeof process.env.CRAWL_USER === "string") {
  options.auth = {
    user: process.env.CRAWL_USER,
    pass: process.env.CRAWL_PASSWORD
  };
}

console.log(`[RM Cache Warmer] Scraping: ${PAGE}/`);

// `c` is referenced by handleCrawlResult to queue newly discovered URLs.
const c = new Crawler(options);

// Seed the crawl with the start page.
c.queue(`${PAGE}/`);

// Exit successfully once the crawler emits 'drain'.
c.on('drain', () => {
  process.exit(0);
});