Skip to content

Commit

Permalink
first
Browse files Browse the repository at this point in the history
  • Loading branch information
echoVic committed Mar 9, 2017
1 parent 75c9a84 commit f515dd8
Show file tree
Hide file tree
Showing 15 changed files with 845 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules/
.idea
58 changes: 58 additions & 0 deletions app.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
var path = require('path');
var express = require('express');
var read = require('./web/read');
var config = require('./config');
var spawn = require('child_process').spawn;
var cronJob = require('cron').CronJob;

var app = express();

// Express setup: EJS templates live in ./views, static assets are served under /public.
app.set('views', __dirname + '/views');
app.set('view engine', 'ejs');
app.use('/public', express.static(path.join(__dirname, 'public')));

/**
 * GET / - home page.
 * Loads an article list and renders the `index` template with it.
 */
function showHomePage(req, res, next) {
    // articleListByClassId(classId, offset, limit, callback):
    //   classId - ID of the article category
    //   offset  - start position of the returned results
    //   limit   - number of results to return
    read.articleListByClassId(0, 0, 20, function (err, articles) {
        if (err) {
            return next(err);
        }

        // Expose the list to the template, then render it.
        res.locals.articleList = articles;
        res.render('index');
    });
}

/**
 * GET /article/:id - single article page.
 * Loads the article identified by the `:id` URL segment and renders the `article` template.
 */
function showArticlePage(req, res, next) {
    // req.params.id holds the `:id` part of the URL.
    var articleId = req.params.id;

    read.article(articleId, function (err, found) {
        if (err) {
            return next(err);
        }

        // Expose the article to the template, then render it.
        res.locals.article = found;
        res.render('article');
    });
}

app.get('/', showHomePage);
app.get('/article/:id', showArticlePage);

app.listen(config.port);
console.log('服务器已启动');

/**
 * One tick of the scheduled update: runs update/all.js in a child Node process,
 * forwards its stdout/stderr to this process, and logs its exit code when it closes.
 */
function runUpdateTask() {
    console.log('开始执行定时更新任务');

    var scriptPath = path.resolve(__dirname, 'update/all.js');
    var child = spawn(process.execPath, [scriptPath]);

    child.stdout.pipe(process.stdout);
    child.stderr.pipe(process.stderr);
    child.on('close', function (exitCode) {
        console.log('更新任务结束,代码=%d', exitCode);
    });
}

// Run the update task on the schedule given by config.autoUpdate.
var job = new cronJob(config.autoUpdate, runUpdateTask);
job.start();

/**
 * Last-resort handler: writes the stack trace of an uncaught exception to stderr.
 * Registering an 'uncaughtException' listener replaces Node's default
 * print-and-exit behavior, so the process keeps running after logging.
 *
 * @param {Error} err - the exception that was not caught anywhere else
 */
function logUncaughtException(err) {
    console.error('uncaughtException:%s', err.stack);
}

process.on('uncaughtException', logUncaughtException);
26 changes: 26 additions & 0 deletions config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// MySQL数据库连接配置
var mysql = require('mysql');
exports.db = mysql.createConnection({
host: '127.0.0.1',
port: 3306,
database: 'sina_blog',
user: 'root',
password: '123456789'
});

// 博客配置
exports.sinaBlog = {
url:'http://blog.sina.com.cn/u/1776757314' // 博客首页地址
};

// Web服务器端口
exports.port = 3000;

// Scheduled update rule, written as a cron pattern for the `cron` package.
//
// The pattern has six space-separated fields:  f1 f2 f3 f4 f5 f6
//   f1 = second, f2 = minute, f3 = hour, f4 = day of month, f5 = month, f6 = day of week
//
// Each field accepts the forms below (described for f1; the other fields are analogous):
//   *      - every second
//   a-b    - every second from second a through second b
//   */n    - every n seconds
//   a-b/n  - every n seconds from second a through second b
//
// All six fields are spelled out on purpose: a five-field pattern is read as
// standard cron (minute first), which would put `*/30` in the hour field.
// This value runs the update at second 0 of every 30th minute.
exports.autoUpdate = '0 */30 * * * *';
20 changes: 20 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"name": "blog-spider",
"main": "app.js",
"version": "0.0.1",
"private": true,
"description": "网络爬虫与数据库操作",
"engines": {
"node": ">=0.10.0"
},
"dependencies": {
"debug": "^0.7.2",
"cheerio": "^0.12.3",
"request": "^2.27.0",
"async": "^0.2.9",
"mysql": "2.x",
"express": "^3.4.0",
"ejs": "^0.8.4",
"cron": "^1.0.1"
}
}
94 changes: 94 additions & 0 deletions update/all.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
var async = require('async');
var config = require('../config');
var read = require('./read');
var save = require('./save');
var debug = require('debug')('spider-blog:update:all');

// Update pipeline: reads the category list and every category's article list
// from the blog configured in config.sinaBlog.url, saves them, then reads and
// saves the detail of each article that is not stored yet.
// The steps run strictly one after another; the first error aborts the rest.

// State shared between the steps below.
var classList;        // category list returned by read.classList
var articleList = {}; // class id -> article list; replaced by a flat, de-duplicated array in step 6

async.series([

    // Step 1: read the list of article categories
    function (done) {
        read.classList(config.sinaBlog.url, function (err, list) {
            classList = list;
            done(err);
        });
    },

    // Step 2: save the article categories
    function (done) {
        save.classList(classList, done);
    },

    // Step 3: read the article list of every category, one category at a time
    function (done) {
        async.eachSeries(classList, function (c, next) {
            read.articleList(c.url, function (err, list) {
                articleList[c.id] = list;
                next(err);
            });
        }, done);
    },

    // Step 4: save the article list of every category
    function (done) {
        async.eachSeries(Object.keys(articleList), function (classId, next) {
            save.articleList(classId, articleList[classId], next);
        }, done);
    },

    // Step 5: save the number of articles in every category
    function (done) {
        async.eachSeries(Object.keys(articleList), function (classId, next) {
            save.articleCount(classId, articleList[classId].length, next);
        }, done);
    },

    // Step 6: flatten the per-category lists into one array without duplicates
    function (done) {
        debug('整理文章列表,把重复的文章去掉');

        // Key by article id so an article listed under several categories is kept once.
        var articles = {};
        Object.keys(articleList).forEach(function (classId) {
            articleList[classId].forEach(function (item) {
                articles[item.id] = item;
            });
        });

        articleList = Object.keys(articles).map(function (id) {
            return articles[id];
        });

        done();
    },

    // Step 7: read and save the detail of every article that is not stored yet
    function (done) {
        async.eachSeries(articleList, function (item, next) {
            // NOTE(review): `isAericleExists` (sic) is the name this script calls on the
            // save module, which is not visible here; keep it in sync with that module.
            save.isAericleExists(item.id, function (err, exists) {
                if (err) return next(err);

                if (exists) {
                    debug('文章已存在:%s', item.url);
                    return next();
                }

                read.articleDetail(item.url, function (err, ret) {
                    if (err) return next(err);
                    save.articleDetail(item.id, ret.tags, ret.content, function (err) {
                        if (err) return next(err);
                        save.articleTags(item.id, ret.tags, next);
                    });
                });
            });
        }, done);
    }
], function (err) {
    if (err) {
        // Report the failure and exit non-zero so the parent process, which logs
        // this script's exit code, can tell a failed run from a successful one.
        console.error(err.stack || err);
        process.exit(1);
        return;
    }

    console.log('完成');
    process.exit(0);
});
103 changes: 103 additions & 0 deletions update/article_all.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');
var debug = require('debug')('spider-blog:update');

/**
 * Fetch a category page and extract the blog posts listed on it.
 *
 * Cells that have no title link, or whose link does not match `blog_<id>.html`,
 * are skipped.
 *
 * @param {String} url - URL of the category (article list) page
 * @param {Function} callback - called as callback(err) when the request fails,
 *     otherwise callback(null, articleList) where every item is
 *     { title, url, time, id }
 */
function readArticleList(url, callback) {
    debug('读取博文列表:%s', url);

    request(url, function (err, res) {
        if (err) return callback(err);

        // Build a DOM query object from the page content
        var $ = cheerio.load(res.body.toString());

        // Collect the posts
        var articleList = [];
        $('.articleList .articleCell').each(function () {
            var $me = $(this);
            var $title = $me.find('.atc_title a');
            var $time = $me.find('.atc_tm');
            var item = {
                title: $title.text().trim(),
                url: $title.attr('href'),
                time: $time.text().trim()
            };

            // Extract the article ID from the URL. A cell without a title link has an
            // undefined href; skip it instead of throwing inside the request callback.
            var s = item.url ? item.url.match(/blog_([a-zA-Z0-9]+)\.html/) : null;
            if (Array.isArray(s)) {
                item.id = s[1];
                articleList.push(item);
            }
        });

        // Hand back the result
        callback(null, articleList);
    });
}

/**
 * Fetch a blog post page and extract its tags and HTML content.
 *
 * @param {String} url - URL of the blog post page
 * @param {Function} callback - called as callback(err) when the request fails or
 *     the page has no `.articalContent` element, otherwise
 *     callback(null, { tags: String[], content: String })
 */
function readArticleDetail(url, callback) {
    debug('读取博文内容:%s', url);

    request(url, function (err, res) {
        if (err) return callback(err);

        // Build a DOM query object from the page content
        var $ = cheerio.load(res.body.toString());

        // Collect the non-empty tag names
        var tags = [];
        $('.blog_tag h3 a').each(function () {
            var tag = $(this).text().trim();
            if (tag) {
                tags.push(tag);
            }
        });

        // Read the article body. .html() does not return a string when nothing
        // matches the selector; report that through the callback instead of
        // throwing a TypeError from inside the request callback.
        var html = $('.articalContent').html();
        if (typeof html !== 'string') {
            return callback(new Error('Article content (.articalContent) not found: ' + url));
        }

        // Hand back the result
        callback(null, { tags: tags, content: html.trim() });
    });
}

/**
 * eachSeries iterator: reads one article's detail and prints it.
 * A failed read is logged and the iteration continues with the next article.
 *
 * @param {Object} article - one element of the article list (its `url` is read)
 * @param {Function} next - must be called to move on to the next element
 */
function printArticleDetail(article, next) {
    readArticleDetail(article.url, function (detailErr, detail) {
        if (detailErr) {
            console.log(detailErr.stack);
        }

        // Print the result as-is
        console.log(detail);

        next();
    });
}

/**
 * Runs once eachSeries has walked through the whole article list.
 */
function onAllArticlesPrinted(err) {
    if (err) {
        return console.error(err.stack);
    }

    console.log('完成');
}

// Read every article of the category and print each one's detail in turn.
readArticleList('http://blog.sina.com.cn/s/articlelist_1776757314_0_1.html', function (listErr, articles) {
    if (listErr) {
        return console.error(listErr.stack);
    }

    // eachSeries hands each element of `articles` to printArticleDetail,
    // one at a time, then calls onAllArticlesPrinted.
    async.eachSeries(articles, printArticleDetail, onAllArticlesPrinted);
});
28 changes: 28 additions & 0 deletions update/article_detail.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
var request = require('request');
var cheerio = require('cheerio');
var debug = require('debug')('spider-blog:update');

// Standalone demo: fetches one hard-coded blog post and prints its tags and HTML content.
debug('读取博文内容');

// Fetch the blog post page
request('http://blog.sina.com.cn/s/blog_69e72a420101gvec.html', function (err, res) {
    // This script has no callback to report to, so log the error and stop.
    if (err) return console.error(err.stack || err);

    // Build a DOM query object from the page content
    var $ = cheerio.load(res.body.toString());

    // Collect the non-empty tag names
    var tags = [];
    $('.blog_tag h3 a').each(function () {
        var tag = $(this).text().trim();
        if (tag) {
            tags.push(tag);
        }
    });

    // Read the article body. .html() does not return a string when nothing
    // matches the selector, so check before calling .trim() on it.
    var html = $('.articalContent').html();
    if (typeof html !== 'string') {
        return console.error('Article content (.articalContent) not found');
    }

    // Print the result
    console.log({tags: tags, content: html.trim()});
});
54 changes: 54 additions & 0 deletions update/article_list.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
var request = require('request');
var cheerio = require('cheerio');
var debug = require('debug')('spider-blog:update');

// Emit a debug message when the script starts.
debug('读取博文列表');

/**
 * Fetch a category page and extract the blog posts listed on it, following the
 * "next page" link recursively and concatenating the results of all pages.
 *
 * Cells that have no title link, or whose link does not match `blog_<id>.html`,
 * are skipped.
 *
 * @param {String} url - URL of the category (article list) page to start from
 * @param {Function} callback - called as callback(err) when any page request
 *     fails, otherwise callback(null, articleList) where every item is
 *     { title, url, time, id }
 */
function readArticleList(url, callback) {
    // Fetch the category page
    request(url, function (err, res) {
        // Report the failure to the caller; merely logging it here would leave
        // the caller waiting forever for its callback.
        if (err) return callback(err);

        // Build a DOM query object from the page content
        var $ = cheerio.load(res.body.toString());

        // Collect the posts on this page
        var articleList = [];
        $('.articleList .articleCell').each(function () {
            var $me = $(this);
            var $title = $me.find('.atc_title a');
            var $time = $me.find('.atc_tm');
            var item = {
                title: $title.text().trim(),
                url: $title.attr('href'),
                time: $time.text().trim()
            };

            // Extract the article ID from the URL. A cell without a title link has an
            // undefined href; skip it instead of throwing inside the request callback.
            var s = item.url ? item.url.match(/blog_([a-zA-Z0-9]+)\.html/) : null;
            if (Array.isArray(s)) {
                item.id = s[1];
                articleList.push(item);
            }
        });

        // Is there a next page?
        var nextUrl = $('.SG_pgnext a').attr('href');
        if (nextUrl) {
            // Read the next page, then merge its posts after this page's posts
            readArticleList(nextUrl, function (err, articleList2) {
                if (err) return callback(err);

                callback(null, articleList.concat(articleList2));
            });
        } else {
            // Last page: hand back the result
            callback(null, articleList);
        }
    });
}

// First page of the category whose posts are listed.
var firstPageUrl = 'http://blog.sina.com.cn/s/articlelist_1776757314_0_1.html';

/**
 * Prints the collected article list; an error, if any, is logged first.
 */
function printArticleList(err, articles) {
    if (err) {
        console.error(err.stack);
    }
    console.log(articles);
}

readArticleList(firstPageUrl, printArticleList);
Loading

0 comments on commit f515dd8

Please sign in to comment.