2
2
import re
3
3
import urlparse
4
4
5
+ from baike_bug import html_downloader , db_util
5
6
from bs4 import BeautifulSoup
6
7
7
- from baike_bug import html_downloader , db_util
8
+ from baike_bug import file_output
8
9
9
10
10
11
class HtmlParser (object ):
11
12
def __init__(self):
    """Create the collaborators this parser relies on.

    Three helper objects are instantiated once per parser and kept on the
    instance so that the parsing methods can reach them through ``self``.
    """
    # Used by the parsing methods to fetch page HTML.
    self.downloader = html_downloader.HtmlDownloader()
    # Used by the parsing methods to run INSERT statements.
    self.dbutil = db_util.DB_Util()
    # Used to write chapter text out to a file.
    self.outputFile = file_output.FileOutPut()
14
16
15
17
# 本地方法需要先定义再使用，也就是定义得放在前面，不然无法调用
16
18
def _get_new_urls (self , page_url , soup ):
@@ -41,20 +43,20 @@ def _get_new_data(self, page_url, soup):
41
43
42
44
# 文章数据解析
43
45
44
- def _parse_chapter (self , chapter_url ):
46
+ def _parse_chapter (self , chapter_url , info_url ):
45
47
if chapter_url is None :
46
48
return
47
49
html_cont = self .downloader .download (chapter_url )
48
50
if html_cont is None :
49
51
return
50
52
soup = BeautifulSoup (html_cont , 'html.parser' , from_encoding = 'utf-8' )
51
53
# new_urls = self._get_new_urls(chapter_url, soup) # self调本地方法,通过url
52
- content_data = self ._parse_content_data (chapter_url , soup )
54
+ content_data = self ._parse_content_data (info_url , chapter_url , soup )
53
55
54
56
if content_data is not None :
55
57
sql = "insert into t_content(chapterurl, content, nexturl, preurl) values " \
56
58
"('%s', '%s', '%s', '%s')" % \
57
- (content_data ['chapterUrl' ], content_data ['content' ], content_data ['nextUrl' ],
59
+ (content_data ['chapterUrl' ], content_data ['content' ]. encode ( 'utf-8' ) , content_data ['nextUrl' ],
58
60
content_data ['preUrl' ])
59
61
self .dbutil .insert (sql )
60
62
@@ -110,35 +112,40 @@ def _parse_chapter_data(self, page_url, soup):
110
112
chapter_list = []
111
113
if page_url .find ("info" ) == - 1 :
112
114
return None
113
- count = 0
115
+ count = 1
114
116
# 匹配title <dd class="lemmaWgt-lemmaTitle-title">
115
117
chapter_list_node = soup .find ('div' , class_ = "volume-wrap" )
116
118
for chapter in chapter_list_node .find_all ('li' ):
119
+ if count > 1 :
120
+ return chapter_list
117
121
chapter_data = {}
118
122
chapter_data ['info_url' ] = page_url
119
123
chapter_data ['chapterNum' ] = count
120
124
chapter_data ['chapterName' ] = chapter .get_text ()
121
125
chapter_data ['chapterUrl' ] = chapter .find ('a' ).get ('href' )
122
126
# join 方法会按照pageurl的格式将new_url补全
123
127
chapter_data ['chapterUrl' ] = urlparse .urljoin (page_url , chapter_data ['chapterUrl' ])
128
+ count = count + 1
124
129
try :
125
130
if chapter .find ('a' ).get ('href' ) is not None :
126
- self ._parse_chapter (chapter_data ['chapterUrl' ]) # 解析内容数据
127
- except :
128
- print "parse_chapter() fail "
131
+ self ._parse_chapter (chapter_data ['chapterUrl' ], page_url ) # 解析内容数据
132
+ except Exception , e :
133
+ print "parse_chapter() fail:" + e
129
134
chapter_list .append (chapter_data )
130
- return chapter_data
135
+ return chapter_list
131
136
132
137
# 内容信息解析 文章内容,章节url,上一章节url,下一章节url
133
- def _parse_content_data (self , page_url , soup ):
138
+ def _parse_content_data (self , info_url , chapter_url , soup ):
134
139
# self._get_new_urls(page_url, soup)
135
140
content_data = {}
136
- if page_url .find ("chapter" ) == - 1 :
141
+ if chapter_url .find ("chapter" ) == - 1 :
137
142
return None
138
- content_data ['chapterUrl' ] = page_url
143
+ content_data ['chapterUrl' ] = chapter_url
139
144
content_node = soup .find ('div' , class_ = "read-content j_readContent" )
140
145
# content_data['content'] = content_node.get_text()
141
- content_data ['content' ] = "hahah"
146
+ filepath = self .outputFile .file_output (content_node .get_text (), info_url .split ("/" ).pop (),
147
+ chapter_url .split ("/" ).pop ())
148
+ content_data ['content' ] = filepath
142
149
# p_nodes = content_node.find_all('p')
143
150
# for content in p_nodes:
144
151
# content_data['content'] = content_data['content'] + '\\n' + content.get_text()
@@ -148,10 +155,10 @@ def _parse_content_data(self, page_url, soup):
148
155
chapter_node = soup .find ('div' , class_ = "chapter-control dib-wrap" )
149
156
for a in chapter_node .find_all ('a' ):
150
157
if a .get ('id' ) is not None and a .get ('id' ).find ("j_chapterPrev" ) != - 1 :
151
- content_data ['preUrl' ] = a .get ('href' )
158
+ content_data ['preUrl' ] = urlparse . urljoin ( chapter_url , a .get ('href' ) )
152
159
continue
153
160
if a .get ('id' ) is not None and a .get ('id' ).find ("j_chapterNext" ) != - 1 :
154
- content_data ['nextUrl' ] = a .get ('href' )
161
+ content_data ['nextUrl' ] = urlparse . urljoin ( chapter_url , a .get ('href' ) )
155
162
continue
156
163
return content_data
157
164
0 commit comments