
Commit 8b0ce1a

committed
Store article content as files (文章以文件形式存储)
1 parent 53639c7 commit 8b0ce1a


5 files changed: +73 −31 lines changed


baike_bug/bug_main.py

+21-11
@@ -1,9 +1,16 @@
 # coding:utf8
-from baike_bug import url_manager, html_downloader, html_parser, html_output, db_config, db_util
+import sys
+
+from baike_bug import url_manager, html_downloader, html_parser, html_output, db_util
+
+defaultencoding = 'utf-8'
 
 
 class BugMain(object):
     def __init__(self):
+        if sys.getdefaultencoding() != defaultencoding:
+            reload(sys)
+            sys.setdefaultencoding(defaultencoding)
         self.urls = url_manager.UrlManager()
         self.downloader = html_downloader.HtmlDownloader()
         self.parser = html_parser.HtmlParser()
@@ -23,29 +30,32 @@ def craw(self, root_url):
                 if info_data is not None:
                     sql = "insert into t_novel(novelurl, novelname, clicknum, wordsnum, type, " \
                           "author, isfinish, biref, imageurl) values " \
-                          "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" %\
-                          (info_data['info_url'] ,info_data['novelName'], info_data['clickNum'],
-                           info_data['wordsNum'], info_data['type'], info_data['author'], info_data['state'],
-                           info_data['brief'], info_data['imageUrl'])
+                          "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % \
+                          (info_data['info_url'].encode('utf-8'), info_data['novelName'].encode('utf-8'),
+                           info_data['clickNum'],
+                           info_data['wordsNum'].encode('utf-8'), info_data['type'].encode('utf-8'),
+                           info_data['author'].encode('utf-8'), info_data['state'].encode('utf-8'),
+                           info_data['brief'].encode('utf-8'), info_data['imageUrl'].encode('utf-8'))
                     self.dbutil.insert(sql)
 
                 if chapter_datas is not None:
                     for chapter_data in chapter_datas:
                         sql = "insert into t_chapter(novelurl, chaptername, chapterurl, chapternum) values " \
                               "('%s', '%s', '%s', '%s')" % \
-                              (chapter_data['info_url'], chapter_data['chapterName'], chapter_data['chapterUrl'],
-                               chapter_data['chapterNum'])
+                              (chapter_data['info_url'].encode('utf-8'), chapter_data['chapterName'].encode('utf-8'),
+                               chapter_data['chapterUrl'].encode('utf-8'),
+                               chapter_data['chapterNum'].encode('utf-8'))
                         self.dbutil.insert(sql)
                 # if count == 2:
                 #     break
                 count = count + 1
-        except:
-            print 'craw fail'
+        except Exception, e:
+            print 'craw fail:' + str(e)
         print 'finish'
         # close the database connection
         self.dbutil.close()
-        # self.outputer.output_html()
-        self.dbutil.db.close()# close the database connection
+        # self.outputer.output_html()
+        self.dbutil.db.close()  # close the database connection
 
 
 if __name__ == "__main__":
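
Side note on the SQL above: because the statements are built with % string interpolation, each unicode field has to be encoded by hand and quoting stays fragile. A minimal sketch of the same t_chapter insert written with MySQLdb query parameters instead (this assumes the Python 2 + MySQLdb environment of this repo and the noveldb schema from this diff; the values are hypothetical):

# coding:utf8
import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="12345",
                       db="noveldb", charset="utf8", use_unicode=True)
cursor = conn.cursor()
sql = ("insert into t_chapter(novelurl, chaptername, chapterurl, chapternum) "
       "values (%s, %s, %s, %s)")
# the driver quotes and encodes each parameter itself, so no manual .encode('utf-8') is needed
cursor.execute(sql, (u'http://example.com/info/1', u'Chapter 1',
                     u'http://example.com/chapter/1', 1))
conn.commit()
conn.close()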

baike_bug/db_config.py

+1-1
@@ -4,5 +4,5 @@
 
 class DB_Config(object):
     def connect(self):
-        conn = MySQLdb.connect(host="localhost", user="root", passwd="123456", db="noveldb", charset="utf8")
+        conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="12345", db="noveldb", charset="utf8")
         return conn
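
For reference, a quick standalone check of the new connection settings (a sketch only; it assumes the MySQL server configured above is reachable from this machine):

# coding:utf8
import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="12345",
                       db="noveldb", charset="utf8")
cursor = conn.cursor()
cursor.execute("select version()")  # trivial round trip to confirm the connection works
print cursor.fetchone()
conn.close()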

baike_bug/db_util.py

+9-4
@@ -1,9 +1,16 @@
 # encoding:utf8
+import sys
+
 from baike_bug import db_config
 
+defaultencoding = 'utf-8'
+
 
 class DB_Util(object):
     def __init__(self):
+        if sys.getdefaultencoding() != defaultencoding:
+            reload(sys)
+            sys.setdefaultencoding(defaultencoding)
         self.db = db_config.DB_Config().connect()
         self.cursor = self.db.cursor()
 
@@ -13,8 +20,9 @@ def insert(self, sql):
             self.cursor.execute(sql)
             # commit to the database
             self.db.commit()
-        except:
+        except Exception, e:
             # roll back on error
+            print "insert fail:" + e
             self.db.rollback()
 
     def query(self, sql):
@@ -38,6 +46,3 @@ def query(self, sql):
                   (fname, lname, age, sex, income)
         except:
             print "Error: unable to fecth data"
-
-
-
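
A short usage sketch of DB_Util, mirroring how bug_main.py drives it (the SQL values are hypothetical; insert() commits on success and rolls back on failure, as shown above):

# coding:utf8
from baike_bug import db_util

util = db_util.DB_Util()
util.insert("insert into t_chapter(novelurl, chaptername, chapterurl, chapternum) "
            "values ('http://example.com/info/1', 'Chapter 1', "
            "'http://example.com/chapter/1', '1')")
util.close()     # close the util wrapper, as bug_main.py does
util.db.close()  # then close the underlying connection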

baike_bug/file_output.py

+20
@@ -0,0 +1,20 @@
+# coding:utf8
+import os
+
+
+class FileOutPut(object):
+    def __init__(self):
+        self.data = {}
+
+    # parameters: file data, directory name, file name
+    def file_output(self, data, dirname, filename):
+        filepath = r'../novelfile/%s/' % dirname
+        if os.path.exists(filepath) is False:
+            os.mkdir(filepath)
+        try:
+            filehandle = open(filepath + filename + ".txt", "w")
+            filehandle.write(data.encode("utf-8"))
+            filehandle.close()
+        except Exception, e:
+            print e
+        return filepath + filename + ".txt"
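
A usage sketch of the new FileOutPut class (the directory and file names here are hypothetical; note that os.mkdir creates only the last path component, so the ../novelfile/ parent directory is assumed to exist already):

# coding:utf8
from baike_bug import file_output

outputer = file_output.FileOutPut()
# writes ../novelfile/novel123/chapter001.txt and returns that path
path = outputer.file_output(u'chapter text goes here', 'novel123', 'chapter001')
print path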

baike_bug/html_parser.py

+22-15
@@ -2,15 +2,17 @@
 import re
 import urlparse
 
+from baike_bug import html_downloader, db_util
 from bs4 import BeautifulSoup
 
-from baike_bug import html_downloader, db_util
+from baike_bug import file_output
 
 
 class HtmlParser(object):
     def __init__(self):
         self.downloader = html_downloader.HtmlDownloader()
         self.dbutil = db_util.DB_Util()
+        self.outputFile = file_output.FileOutPut()
 
     # helper methods must be defined before use, i.e. the definition has to come first or the call fails
     def _get_new_urls(self, page_url, soup):
@@ -41,20 +43,20 @@ def _get_new_data(self, page_url, soup):
 
     # parse article data
 
-    def _parse_chapter(self, chapter_url):
+    def _parse_chapter(self, chapter_url, info_url):
         if chapter_url is None:
             return
         html_cont = self.downloader.download(chapter_url)
         if html_cont is None:
             return
         soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
         # new_urls = self._get_new_urls(chapter_url, soup)  # call the helper via self, given a url
-        content_data = self._parse_content_data(chapter_url, soup)
+        content_data = self._parse_content_data(info_url, chapter_url, soup)
 
         if content_data is not None:
             sql = "insert into t_content(chapterurl, content, nexturl, preurl) values " \
                   "('%s', '%s', '%s', '%s')" % \
-                  (content_data['chapterUrl'], content_data['content'], content_data['nextUrl'],
+                  (content_data['chapterUrl'], content_data['content'].encode('utf-8'), content_data['nextUrl'],
                    content_data['preUrl'])
             self.dbutil.insert(sql)
 
@@ -110,35 +112,40 @@ def _parse_chapter_data(self, page_url, soup):
         chapter_list = []
         if page_url.find("info") == -1:
             return None
-        count = 0
+        count = 1
         # match the title <dd class="lemmaWgt-lemmaTitle-title">
         chapter_list_node = soup.find('div', class_="volume-wrap")
         for chapter in chapter_list_node.find_all('li'):
+            if count > 1:
+                return chapter_list
             chapter_data = {}
             chapter_data['info_url'] = page_url
             chapter_data['chapterNum'] = count
             chapter_data['chapterName'] = chapter.get_text()
             chapter_data['chapterUrl'] = chapter.find('a').get('href')
             # urljoin completes new_url according to the format of page_url
             chapter_data['chapterUrl'] = urlparse.urljoin(page_url, chapter_data['chapterUrl'])
+            count = count + 1
             try:
                 if chapter.find('a').get('href') is not None:
-                    self._parse_chapter(chapter_data['chapterUrl'])  # parse the content data
-            except:
-                print "parse_chapter() fail "
+                    self._parse_chapter(chapter_data['chapterUrl'], page_url)  # parse the content data
+            except Exception, e:
+                print "parse_chapter() fail:" + e
             chapter_list.append(chapter_data)
-        return chapter_data
+        return chapter_list
 
     # content parsing: article content, chapter url, previous chapter url, next chapter url
-    def _parse_content_data(self, page_url, soup):
+    def _parse_content_data(self, info_url, chapter_url, soup):
         # self._get_new_urls(page_url, soup)
         content_data = {}
-        if page_url.find("chapter") == -1:
+        if chapter_url.find("chapter") == -1:
             return None
-        content_data['chapterUrl'] = page_url
+        content_data['chapterUrl'] = chapter_url
         content_node = soup.find('div', class_="read-content j_readContent")
         # content_data['content'] = content_node.get_text()
-        content_data['content'] = "hahah"
+        filepath = self.outputFile.file_output(content_node.get_text(), info_url.split("/").pop(),
+                                               chapter_url.split("/").pop())
+        content_data['content'] = filepath
         # p_nodes = content_node.find_all('p')
         # for content in p_nodes:
         #     content_data['content'] = content_data['content'] + '\\n' + content.get_text()
@@ -148,10 +155,10 @@ def _parse_content_data(self, page_url, soup):
         chapter_node = soup.find('div', class_="chapter-control dib-wrap")
         for a in chapter_node.find_all('a'):
             if a.get('id') is not None and a.get('id').find("j_chapterPrev") != -1:
-                content_data['preUrl'] = a.get('href')
+                content_data['preUrl'] = urlparse.urljoin(chapter_url, a.get('href'))
                 continue
             if a.get('id') is not None and a.get('id').find("j_chapterNext") != -1:
-                content_data['nextUrl'] = a.get('href')
+                content_data['nextUrl'] = urlparse.urljoin(chapter_url, a.get('href'))
                 continue
         return content_data
 
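
The preUrl/nextUrl change above relies on urlparse.urljoin to complete relative or protocol-relative hrefs against the current chapter URL, the same way chapterUrl is already completed. A small illustration with hypothetical URLs:

# coding:utf8
import urlparse  # Python 2 module, as used in html_parser.py

chapter_url = 'https://example.com/book/1/chapter/100'
print urlparse.urljoin(chapter_url, '//read.example.com/chapter/101')
# -> 'https://read.example.com/chapter/101'  (scheme taken from chapter_url)
print urlparse.urljoin(chapter_url, '/chapter/99')
# -> 'https://example.com/chapter/99'        (host taken from chapter_url)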
