
Commit c3f6841

author 宋智航 committed “创建python项目” ("Create Python project")
0 parents  commit c3f6841

File tree

8 files changed: +153 -0 lines changed

baike_bug/__init__.py

Whitespace-only changes.

baike_bug/bug_main.py

+34
@@ -0,0 +1,34 @@
# coding:utf8
from baike_bug import url_manager, html_downloader, html_parser, html_output


class BugMain(object):
    def __init__(self):
        # Wire together the four components: URL manager, downloader, parser, outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_output.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # stop after 1000 pages
                    break
                count = count + 1
            except:
                print 'craw fail'
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    obj_bug = BugMain()
    obj_bug.craw(root_url)
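For orientation, a single-page dry run of the same pipeline; a hypothetical snippet for illustration only (the module and class names come from this commit, the one-off run replaces the full craw loop):

# Hypothetical single-page walk-through of the pipeline above (illustration only):
from baike_bug import url_manager, html_downloader, html_parser, html_output

urls = url_manager.UrlManager()
urls.add_new_url("https://baike.baidu.com/item/Python/407313?fr=aladdin")

page_url = urls.get_new_url()                        # take one URL off the queue
html_cont = html_downloader.HtmlDownloader().download(page_url)
if html_cont is not None:
    new_urls, new_data = html_parser.HtmlParser().parse(page_url, html_cont)
    urls.add_new_urls(new_urls)                      # queue the links found on the page
    html_output.HtmlOutputer().collect_data(new_data)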

baike_bug/html_downloader.py

+13
@@ -0,0 +1,13 @@
# coding:utf8
import urllib2


class HtmlDownloader(object):
    def download(self, url):  # url: the page URL to download
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:  # anything other than HTTP 200 is treated as a failure
            return None

        return response.read()
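A hedged usage sketch of the downloader (Python 2, since the module relies on urllib2; the URL is simply the root URL from bug_main.py):

from baike_bug.html_downloader import HtmlDownloader

downloader = HtmlDownloader()
html = downloader.download('https://baike.baidu.com/item/Python/407313?fr=aladdin')
if html is None:
    print 'download failed or returned a non-200 status'
else:
    print 'downloaded %d bytes' % len(html)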

baike_bug/html_output.py

+7
@@ -0,0 +1,7 @@
# coding:utf8
class HtmlOutputer(object):
    def collect_data(self, new_data):  # placeholder: collecting parsed data is not implemented yet
        pass

    def output_html(self):  # placeholder: writing the output page is not implemented yet
        pass
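Both methods are still stubs in this commit. A minimal sketch of what they might eventually do, assuming the outputer buffers the parsed dicts and dumps them into a simple HTML table (an assumption, not part of this commit):

# coding:utf8
# Assumed eventual behaviour of HtmlOutputer (not part of this commit):
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, new_data):
        if new_data is None:
            return
        self.datas.append(new_data)  # buffer {'title': ..., 'summary': ...} dicts

    def output_html(self):
        with open('output.html', 'w') as fout:
            fout.write('<html><body><table>')
            for data in self.datas:
                fout.write('<tr><td>%s</td><td>%s</td></tr>' %
                           (data['title'].encode('utf-8'),
                            data['summary'].encode('utf-8')))
            fout.write('</table></body></html>')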

baike_bug/html_parser.py

+31
@@ -0,0 +1,31 @@
# coding:utf8
import re
import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None  # nothing to parse
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # Entry links look like /view/<number>.html
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.html"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)  # resolve relative links against the current page
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd> -- find() grabs the first child tag
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find()
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
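A small self-contained check of the link-extraction regex, using a hypothetical HTML snippet (note that only /view/<digits>.html hrefs match; /item/... links, like the root URL in bug_main.py, are ignored by this pattern):

# coding:utf8
from bs4 import BeautifulSoup
from baike_bug.html_parser import HtmlParser

sample = '<a href="/view/21087.html">Perl</a> <a href="/item/Java/85979">Java</a>'
soup = BeautifulSoup(sample, 'html.parser')
urls = HtmlParser()._get_new_urls('https://baike.baidu.com/view/407313.html', soup)
print urls  # set(['https://baike.baidu.com/view/21087.html']) -- the /item/ link is skipped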

baike_bug/url_manager.py

+26
@@ -0,0 +1,26 @@
# coding:utf8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        new_url = self.new_urls.pop()  # pop() both returns and removes the URL
        self.old_urls.add(new_url)
        return new_url
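A quick hypothetical demonstration of the deduplication behaviour (the URLs are made up for illustration):

# coding:utf8
from baike_bug.url_manager import UrlManager

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/view/21087.html')
manager.add_new_urls(['https://baike.baidu.com/view/21087.html',   # duplicate, ignored
                      'https://baike.baidu.com/view/407313.html'])
url = manager.get_new_url()        # moved from new_urls to old_urls
manager.add_new_url(url)           # re-adding a crawled URL is a no-op
print manager.has_new_url()        # True: one URL is still waiting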

test/__init__.py

Whitespace-only changes.

test/my_test.py

+42
@@ -0,0 +1,42 @@
# coding:utf8
import re, urllib2, bs4

from bs4 import BeautifulSoup

print bs4

# url = 'www.baidu,com'
#
# urllib2.urlopen(url, "", 100000)
#
# re.findall("")

html_doc = """ <div class="J-next-auto hide next-auto"><em>3</em> 秒后播放下一节</div>
<div class="J-next-btn hide next-auto btn btn-green">下一节</div>
<a href="/video/10687/0" class="review-course">重新观看</a>

<div id="js-ques-box"></div> </div>

</div>
"""
soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
print 'get all links'
links = soup.find_all('a')
for link in links:
    print link.name, link['href'], link.get_text()

print 'find a link by exact href'
link_node = soup.find('a', href='/video/10687/0')
print link_node.name, link_node['href'], link_node.get_text()

print 'find a link by regex'
link_node = soup.find('a', href=re.compile(r'video'))
print link_node.name, link_node['href'], link_node.get_text()

print 'find a div by class'
link_node = soup.find('div', class_='J-next-auto hide next-auto')
print link_node.name, link_node.get_text()

print 'find a div by regex on id'
link_node = soup.find('div', id=re.compile(r'ques'))  # a regex allows partial (fuzzy) matching
print link_node.name, link_node['id'], link_node.get_text()  # printing raises if no match was found
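As the last comment notes, the final print raises if the regex matches nothing. A guarded version, written as a hypothetical snippet (not part of the commit):

# coding:utf8
import re
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="js-ques-box"></div>', 'html.parser')
node = soup.find('div', id=re.compile(r'does-not-exist'))
if node is None:
    print 'no matching div'   # avoids the AttributeError from printing attributes of None
else:
    print node.name, node['id'], node.get_text()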
