-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
845 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
node_modules/ | ||
.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
var path = require('path'); | ||
var express = require('express'); | ||
var read = require('./web/read'); | ||
var config = require('./config'); | ||
var spawn = require('child_process').spawn; | ||
var cronJob = require('cron').CronJob; | ||
|
||
var app = express();

// Express configuration: templates are rendered with EJS from ./views, and
// the files under ./public are served beneath the /public URL prefix.
var staticDir = path.join(__dirname, 'public');
app.set('view engine', 'ejs');
app.set('views', __dirname + '/views');
app.use('/public', express.static(staticDir));
|
||
// Site home page.
app.get('/', function homePage(req, res, next) {
  // Arguments of articleListByClassId, in order:
  //   1. id of the article category
  //   2. offset of the first result to return
  //   3. number of results to return
  read.articleListByClassId(0, 0, 20, function onArticleList(err, list) {
    if (err) {
      return next(err);
    }

    // Expose the list to the template, then render it.
    res.locals.articleList = list;
    res.render('index');
  });
});
|
||
// Article page.
app.get('/article/:id', function articlePage(req, res, next) {
  // req.params.id holds whatever matched the :id segment of the URL.
  var articleId = req.params.id;

  read.article(articleId, function onArticle(err, article) {
    if (err) {
      return next(err);
    }

    // Expose the article to the template, then render it.
    res.locals.article = article;
    res.render('article');
  });
});
|
||
// Start the web server. The start-up message is printed from the listen
// callback so it only appears once the port has actually been bound; if
// binding fails (for example the port is already in use) the message is
// no longer printed.
app.listen(config.port, function () {
  console.log('服务器已启动');
});
|
||
// Scheduled update task.
// On every tick of the config.autoUpdate cron expression, update/all.js is
// run in a child Node process whose stdout/stderr are forwarded to this
// process.
var updateRunning = false;
var job = new cronJob(config.autoUpdate, function () {
  // Skip this tick while the previous update is still running, so two
  // update processes never run at the same time.
  if (updateRunning) {
    console.log('上一次更新任务尚未结束,跳过本次执行');
    return;
  }
  updateRunning = true;
  console.log('开始执行定时更新任务');

  var update = spawn(process.execPath, [path.resolve(__dirname, 'update/all.js')]);
  update.stdout.pipe(process.stdout);
  update.stderr.pipe(process.stderr);

  // 'error' is emitted when the child process could not be spawned; handle
  // it here instead of letting it surface as an uncaught exception, and
  // release the guard so the next tick can try again.
  update.on('error', function (err) {
    updateRunning = false;
    console.error('更新任务启动失败:%s', err && err.stack ? err.stack : err);
  });

  update.on('close', function (code) {
    updateRunning = false;
    console.log('更新任务结束,代码=%d', code);
  });
});
job.start();
|
||
// Last-resort handler: log uncaught exceptions instead of letting them
// terminate the process.
process.on('uncaughtException', function (err) {
  // Anything can be thrown, not only Error objects. Values without a
  // `stack` (strings, numbers, null, ...) are logged as they are rather
  // than as "undefined".
  var detail = (err && err.stack) ? err.stack : err;
  console.error('uncaughtException:%s', detail);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// MySQL数据库连接配置 | ||
var mysql = require('mysql'); | ||
exports.db = mysql.createConnection({ | ||
host: '127.0.0.1', | ||
port: 3306, | ||
database: 'sina_blog', | ||
user: 'root', | ||
password: '123456789' | ||
}); | ||
|
||
// 博客配置 | ||
exports.sinaBlog = { | ||
url:'http://blog.sina.com.cn/u/1776757314' // 博客首页地址 | ||
}; | ||
|
||
// Web服务器端口 | ||
exports.port = 3000; | ||
|
||
// Schedule of the automatic update task, as a cron expression with six
// space-separated fields:
//
//   f1 f2 f3 f4 f5 f6
//
// f1 = second, f2 = minute, f3 = hour, f4 = day of the month, f5 = month,
// f6 = day of the week. Each field accepts the following forms (shown for
// f1; the other fields work the same way):
//   *      every second
//   a-b    every second from a through b
//   */n    every n seconds
//   a-b/n  every n seconds from a through b
//
// The value below fires at second 0 of every 30th minute, i.e. once every
// half hour. Keep the step in the minute field: a step of 30 in the hour
// field only ever matches hour 0, and a `*` in the minute field then makes
// the task fire every minute during that hour.
exports.autoUpdate = '0 */30 * * * *';
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
{ | ||
"name": "blog-spider", | ||
"main": "app.js", | ||
"version": "0.0.1", | ||
"private": true, | ||
"description": "网络爬虫与数据库操作", | ||
"engines": { | ||
"node": ">=0.10.0" | ||
}, | ||
"dependencies": { | ||
"debug": "^0.7.2", | ||
"cheerio": "^0.12.3", | ||
"request": "^2.27.0", | ||
"async": "^0.2.9", | ||
"mysql": "2.x", | ||
"express": "^3.4.0", | ||
"ejs": "^0.8.4", | ||
"cron": "^1.0.1" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
var async = require('async'); | ||
var config = require('../config'); | ||
var read = require('./read'); | ||
var save = require('./save'); | ||
var debug = require('debug')('spider-blog:update:all'); | ||
|
||
// Category list read in step 1.
var classList;
// Article lists keyed by category id, filled in by step 3.
var articleList = {};
// Every article exactly once, built from articleList in step 6.
var uniqueArticles = [];

// Full update: the steps below run strictly one after another, and the
// first error skips the remaining steps and goes to the final callback.
async.series([

  // 1. Read the list of article categories.
  function (done) {
    read.classList(config.sinaBlog.url, function (err, list) {
      classList = list;
      done(err);
    });
  },

  // 2. Save the article categories.
  function (done) {
    save.classList(classList, done);
  },

  // 3. Read the article list of every category, one category at a time.
  function (done) {
    async.eachSeries(classList, function (c, next) {
      read.articleList(c.url, function (err, list) {
        articleList[c.id] = list;
        next(err);
      });
    }, done);
  },

  // 4. Save the article list of every category.
  function (done) {
    async.eachSeries(Object.keys(articleList), function (classId, next) {
      save.articleList(classId, articleList[classId], next);
    }, done);
  },

  // 5. Save the number of articles in every category.
  function (done) {
    async.eachSeries(Object.keys(articleList), function (classId, next) {
      save.articleCount(classId, articleList[classId].length, next);
    }, done);
  },

  // 6. Flatten the per-category lists into one list without duplicates.
  function (done) {
    debug('整理文章列表,把重复的文章去掉');

    // Index by article id: an article listed under several categories
    // collapses into a single entry.
    var articles = {};
    Object.keys(articleList).forEach(function (classId) {
      articleList[classId].forEach(function (item) {
        articles[item.id] = item;
      });
    });

    uniqueArticles = Object.keys(articles).map(function (id) {
      return articles[id];
    });

    done();
  },

  // 7. Read and save the details of every article that is not stored yet.
  function (done) {
    async.eachSeries(uniqueArticles, function (item, next) {
      // The method name (including its spelling) is the one this script has
      // always called on the save module.
      save.isAericleExists(item.id, function (err, exists) {
        if (err) return next(err);

        if (exists) {
          debug('文章已存在:%s', item.url);
          return next();
        }

        read.articleDetail(item.url, function (err, ret) {
          if (err) return next(err);
          save.articleDetail(item.id, ret.tags, ret.content, function (err) {
            if (err) return next(err);
            save.articleTags(item.id, ret.tags, next);
          });
        });
      });
    }, done);
  }
], function (err) {
  if (err) {
    console.error(err.stack || err);
    // Exit with a non-zero code so that whoever launched this script can
    // tell that the update failed.
    process.exit(1);
    return;
  }

  console.log('完成');
  process.exit(0);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
var request = require('request'); | ||
var cheerio = require('cheerio'); | ||
var async = require('async'); | ||
var debug = require('debug')('spider-blog:update'); | ||
|
||
/*
 * Fetch a category page and extract the list of articles on it.
 *
 * @param {String} url       address of the category (article list) page
 * @param {Function} callback  called as callback(err, articleList); every
 *                             entry has the shape {title, url, time, id}
 * */
function readArticleList(url, callback) {
  debug('读取博文列表:%s', url);

  request(url, function (err, res) {
    if (err) return callback(err);

    // Build a DOM-like query object from the page body
    var $ = cheerio.load(res.body.toString());

    // Collect the articles
    var articleList = [];
    $('.articleList .articleCell').each(function () {
      var $me = $(this);
      var $title = $me.find('.atc_title a');
      var $time = $me.find('.atc_tm');
      var item = {
        title: $title.text().trim(),
        url: $title.attr('href'),
        time: $time.text().trim()
      };

      // A cell without a title link has no href; skip it instead of
      // throwing inside the request callback.
      if (typeof item.url !== 'string') return;

      // The article id is embedded in the URL; entries whose URL does not
      // contain one are left out.
      var s = item.url.match(/blog_([a-zA-Z0-9]+)\.html/);
      if (Array.isArray(s)) {
        item.id = s[1];
        articleList.push(item);
      }
    });

    // Hand back the result
    callback(null, articleList);
  });
}
|
||
/*
 * Fetch an article page and extract its tags and content.
 *
 * @param {String} url       address of the article page
 * @param {Function} callback  called as callback(err, {tags, content});
 *                             tags is an array of strings, content is the
 *                             HTML inside the .articalContent element
 * */
function readArticleDetail(url, callback) {
  debug('读取博文内容:%s', url);

  request(url, function (err, res) {
    if (err) return callback(err);

    // Build a DOM-like query object from the page body
    var $ = cheerio.load(res.body.toString());

    // Collect the non-empty tags
    var tags = [];
    $('.blog_tag h3 a').each(function () {
      var tag = $(this).text().trim();
      if (tag) {
        tags.push(tag);
      }
    });

    // Extract the article content. When the page has no .articalContent
    // element there is no HTML string to trim, so report the problem
    // through the callback instead of throwing inside the request callback.
    var html = $('.articalContent').html();
    if (typeof html !== 'string') {
      return callback(new Error('找不到文章内容:' + url));
    }
    var content = html.trim();

    // Hand back the result
    callback(null, {tags: tags, content: content});
  });
}
|
||
// Read every article listed on the category page, one after another.
readArticleList('http://blog.sina.com.cn/s/articlelist_1776757314_0_1.html', function (err, articleList) {
  if (err) {
    return console.error(err.stack);
  }

  // Worker handed to async.eachSeries: it receives one element of
  // articleList plus a callback that must be invoked to move on to the
  // next element.
  function showArticle(article, next) {
    // Read the article content
    readArticleDetail(article.url, function (detailErr, detail) {
      if (detailErr) {
        console.log(detailErr.stack);
      }

      // Print it as is
      console.log(detail);

      // Signal that this element is done
      next();
    });
  }

  // Runs once the whole articleList has been walked through.
  function finished(loopErr) {
    if (loopErr) {
      return console.error(loopErr.stack);
    }

    console.log('完成');
  }

  async.eachSeries(articleList, showArticle, finished);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
var request = require('request'); | ||
var cheerio = require('cheerio'); | ||
var debug = require('debug')('spider-blog:update'); | ||
|
||
debug('读取博文内容');

// Fetch one article page and print its tags and content.
request('http://blog.sina.com.cn/s/blog_69e72a420101gvec.html', function (err, res) {
  // This is a stand-alone script: there is no callback to pass the error
  // to, so report it and stop.
  if (err) return console.error(err.stack || err);

  // Build a DOM-like query object from the page body
  var $ = cheerio.load(res.body.toString());

  // Collect the non-empty tags
  var tags = [];
  $('.blog_tag h3 a').each(function () {
    var tag = $(this).text().trim();
    if (tag) {
      tags.push(tag);
    }
  });

  // Extract the article content. When the page has no .articalContent
  // element there is no HTML string to trim, so report that and stop.
  var html = $('.articalContent').html();
  if (typeof html !== 'string') {
    return console.error('找不到文章内容');
  }
  var content = html.trim();

  // Print the result
  console.log({tags: tags, content: content});
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
var request = require('request'); | ||
var cheerio = require('cheerio'); | ||
var debug = require('debug')('spider-blog:update'); | ||
|
||
debug('读取博文列表');

/*
 * Fetch a category page and every following page, and collect the
 * articles listed on them.
 *
 * @param {String} url       address of the first category page to read
 * @param {Function} callback  called as callback(err, articleList); every
 *                             entry has the shape {title, url, time, id}
 * */
function readArticleList(url, callback) {
  // Read the category page
  request(url, function (err, res) {
    // Report the failure to the caller; only logging it here would leave
    // the caller waiting forever for its callback.
    if (err) return callback(err);

    // Build a DOM-like query object from the page body
    var $ = cheerio.load(res.body.toString());

    // Collect the articles on this page
    var articleList = [];
    $('.articleList .articleCell').each(function () {
      var $me = $(this);
      var $title = $me.find('.atc_title a');
      var $time = $me.find('.atc_tm');
      var item = {
        title: $title.text().trim(),
        url: $title.attr('href'),
        time: $time.text().trim()
      };

      // A cell without a title link has no href; skip it instead of
      // throwing inside the request callback.
      if (typeof item.url !== 'string') return;

      // The article id is embedded in the URL; entries whose URL does not
      // contain one are left out.
      var s = item.url.match(/blog_([a-zA-Z0-9]+)\.html/);
      if (Array.isArray(s)) {
        item.id = s[1];
        articleList.push(item);
      }
    });

    // Follow the "next page" link, if there is one
    var nextUrl = $('.SG_pgnext a').attr('href');
    if (nextUrl) {
      // Read the following pages, then append their articles to this page's
      readArticleList(nextUrl, function (err, articleList2) {
        if (err) return callback(err);

        callback(null, articleList.concat(articleList2));
      });
    } else {
      // Last page: hand back the result
      callback(null, articleList);
    }
  });
}
|
||
// Read the whole category, starting from its first page, and print the
// resulting list (after printing the error stack, if there was an error).
readArticleList(
  'http://blog.sina.com.cn/s/articlelist_1776757314_0_1.html',
  function printArticleList(err, articleList) {
    if (err) {
      console.error(err.stack);
    }
    console.log(articleList);
  }
);
Oops, something went wrong.