forked from dataaug/zhihu-spider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathZhihuSpider.py
184 lines (164 loc) · 7.79 KB
/
ZhihuSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# -*- coding: utf-8 -*-
"""
Module name: ZhihuSpider;
Author: Duguce;
Description: 抓取知乎某一问题下的所有回答(回答数量不超过800左右)
"""
import datetime
import time, json, re
from time import sleep
import pandas as pd
import config
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
DEBUG = False
def get_html(url):
    """Load a Zhihu question page, dismiss the login modal, and collect
    all answer elements.

    Opens *url* in a headless Chrome session, scrolls to the bottom of
    the page so lazily-loaded answers are fetched, then returns the
    answer nodes.

    :param url: the Zhihu question URL to scrape.
    :return: tuple ``(answerElementList, driver)`` — the ``.ContentItem``
        WebElements and the live driver.  The caller owns the driver and
        is responsible for quitting it.
    """
    driver = get_driver(url)
    # Implicit wait: poll up to 10s for elements looked up below.
    driver.implicitly_wait(10)
    # Maximize the window (no visual effect in headless mode).
    driver.maximize_window()
    driver.get(url)
    if not DEBUG:
        time.sleep(random.uniform(3, 4))
    # Best-effort dismissal of the login modal.  The modal does not
    # always appear; previously a missing close button raised
    # NoSuchElementException and aborted the whole question.
    try:
        close_btn = driver.find_element(
            By.XPATH, "//button[@class='Button Modal-closeButton Button--plain']")
        close_btn.click()
    except Exception:
        pass  # no login modal shown — continue scraping
    scroll_to_bottom(driver)
    answerElementList = driver.find_elements(
        By.CSS_SELECTOR, "#QuestionAnswers-answers .List-item .ContentItem")
    return answerElementList, driver
def get_driver(url):
    """Create and return a headless Chrome WebDriver configured for scraping.

    Image loading is disabled to shorten page-load time.

    :param url: currently unused; kept for backward compatibility with
        existing callers.
    :return: a ready-to-use ``selenium.webdriver.Chrome`` instance.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # Chrome's docs recommend this flag to work around a headless-mode bug.
    chrome_options.add_argument('--disable-gpu')
    # Disable image loading to reduce scraping time.
    prefs = {'profile.default_content_setting_values': {'images': 2, }}
    chrome_options.add_experimental_option("prefs", prefs)
    # Driver binary path is for macOS/Linux; on Windows append ".exe".
    s = Service("Driver/chromedriver")
    driver = webdriver.Chrome(service=s, options=chrome_options)
    return driver
def scroll_to_bottom(driver):
    """Scroll the page down in small steps until the document height
    stops growing, so that all lazily-loaded answers are fetched.

    :param driver: a live selenium WebDriver positioned on the question page.
    """
    # JS snippet returning the total document height.  (The original
    # "return action=document.body.scrollHeight" leaked a global `action`
    # variable into the page's JS environment.)
    js = 'return document.body.scrollHeight'
    # Scroll position already covered so far.
    height = 0
    # Current total document height.
    currHeight = driver.execute_script(js)
    while height < currHeight:
        # Scroll down gradually (100px steps) so lazy-load triggers fire.
        for i in range(height, currHeight, 100):
            driver.execute_script("window.scrollTo(0, {})".format(i))
            if not DEBUG:
                time.sleep(0.02)
        height = currHeight
        # Give newly loaded content time to extend the page *before*
        # re-measuring; the original measured first and slept after, so
        # content arriving during the sleep could be missed on the last pass.
        if not DEBUG:
            time.sleep(3)
        currHeight = driver.execute_script(js)
def get_answers(answerElementList, url):
    """Extract metadata and text from every answer element on the page.

    :param answerElementList: list of ``.ContentItem`` WebElements
        returned by :func:`get_html`.
    :param url: the question URL, stored with each row.
    :return: tuple ``(answerData, question_title)`` — a DataFrame with one
        row per answer, and the question title (empty string when the
        answer list is empty; the original raised UnboundLocalError then).
    """
    columns = ('question_title', 'answer_url', 'question_url', 'author_name',
               'fans_count', 'created_time', 'updated_time', 'comment_count',
               'voteup_count', 'content')
    rows = []
    # Fallback title so the return statement is safe with zero answers.
    question_title = ''
    for numAnswer, answer in enumerate(answerElementList, start=1):
        # "data-zop" is a JSON attribute carrying the title and author name.
        dictText = json.loads(answer.get_attribute('data-zop'))
        question_title = dictText['title']  # question title
        # Permalink of this answer.
        answer_url = answer.find_element(
            By.XPATH,
            "meta[@itemprop='url' and contains(@content, 'answer')]"
        ).get_attribute('content')
        author_name = dictText['authorName']  # answer author's name
        # Author's follower count.
        fans_count = answer.find_element(
            By.XPATH, "*//meta[contains(@itemprop, 'followerCount')]"
        ).get_attribute('content')
        # Creation and last-edit timestamps of the answer.
        created_time = answer.find_element(
            By.XPATH, "meta[@itemprop='dateCreated']").get_attribute('content')
        updated_time = answer.find_element(
            By.XPATH, "meta[@itemprop='dateModified']").get_attribute('content')
        # Comment and upvote counters.
        comment_count = answer.find_element(
            By.XPATH, "meta[@itemprop='commentCount']").get_attribute('content')
        voteup_count = answer.find_element(
            By.XPATH, "meta[@itemprop='upvoteCount']").get_attribute('content')
        # The answer body is a sequence of <p> paragraphs; join their text.
        contents = answer.find_elements(By.TAG_NAME, "p")
        content = ''.join(p.text for p in contents)
        if not DEBUG:
            time.sleep(0.001)
        rows.append({'question_title': question_title,
                     'author_name': author_name,
                     'question_url': url,
                     'answer_url': answer_url,
                     'fans_count': fans_count,
                     'created_time': created_time,
                     'updated_time': updated_time,
                     'comment_count': comment_count,
                     'voteup_count': voteup_count,
                     'content': content})
        print(f"[NORMAL] 问题:【{question_title}】 的第 {numAnswer} 个回答抓取完成...")
        if not DEBUG:
            time.sleep(0.2)
    # Build the frame once at the end: DataFrame.append was removed in
    # pandas 2.0, and the old row-by-row appending was O(n^2) anyway.
    answerData = pd.DataFrame(rows, columns=columns)
    return answerData, question_title
if __name__ == '__main__':
    # Resume support: collect the question URLs already present in the
    # result file so they are skipped on re-runs.
    write_header = False
    try:
        df_tmp = pd.read_csv('zhihu_result.csv')
        question_url_contained = set(df_tmp['question_url'].to_list())
        del df_tmp
    except Exception as e:
        print('no breakpoint:', e)
        question_url_contained = set()
        # Result file missing/unreadable: a fresh one needs a header row.
        write_header = True
    if write_header:
        # Write the header only when starting a new result file.  The
        # original appended a header row on every run (mode='a',
        # header=True), injecting duplicate header lines into resumed CSVs.
        answerData_init = pd.DataFrame(
            columns=('question_title', 'answer_url', 'question_url',
                     'author_name', 'fans_count', 'created_time',
                     'updated_time', 'comment_count', 'voteup_count',
                     'content'))
        answerData_init.to_csv('zhihu_result.csv', mode='a',
                               encoding='utf-8', index=False, header=True)
    print('需要抓取的问题数量:', len(config.urls))
    for url in config.urls:
        if url in question_url_contained:
            continue
        print('----------------------------------------')
        print('url:', url)
        driver = None  # so the finally clause can always clean up safely
        try:
            # Randomized pause between questions to reduce ban risk.
            if not DEBUG:
                time.sleep(random.uniform(60, 120))
            answerElementList, driver = get_html(url)
            print("[NORMAL] 开始抓取该问题的回答...")
            answerData, question_title = get_answers(answerElementList, url)
            print(f"[NORMAL] 问题:【{question_title}】 的回答全部抓取完成...")
            if not DEBUG:
                time.sleep(random.uniform(1, 3))
            # Strip characters unsafe for file names / log output.
            question_title = re.sub(r'[\W]', '', question_title)
            # Append this question's answers to the aggregate CSV.
            answerData.to_csv('zhihu_result.csv', mode='a',
                              encoding='utf-8', index=False, header=False)
            print(f"[NORMAL] 问题:【{question_title}】 的回答已经保存至 (unknown).xlsx...")
        except Exception as e:
            # Back off for a while after a failure, then move on.
            if not DEBUG:
                time.sleep(random.uniform(300, 400))
            print(e)
            print(f"[ERROR] 抓取失败...")
        finally:
            # Always release the browser, even on failure — the original
            # leaked the driver on every exception, and used close()
            # (closes the window) instead of quit() (ends the session).
            if driver is not None:
                driver.quit()