diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..af7c512
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/../../../../:\PycharmProjects\知乎\.idea/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/dataSources/f96628bb-436c-4786-b8df-bd36cf019416.xml b/.idea/dataSources/f96628bb-436c-4786-b8df-bd36cf019416.xml
new file mode 100644
index 0000000..507037d
--- /dev/null
+++ b/.idea/dataSources/f96628bb-436c-4786-b8df-bd36cf019416.xml
@@ -0,0 +1,13 @@
+
+
+
+
+ 4.4.1
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..66971cb
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..a2e120d
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..1b8c282
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git "a/.idea/\347\237\245\344\271\216.iml" "b/.idea/\347\237\245\344\271\216.iml"
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ "b/.idea/\347\237\245\344\271\216.iml"
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8a4ee49
Binary files /dev/null and b/README.md differ
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..1a1bb22
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,3 @@
+"""
+ @desc Zhihu crawler. Structure: topic square -> topic -> question -> answer -> comment, plus standalone user profiles
+"""
\ No newline at end of file
diff --git a/error_log.txt b/error_log.txt
new file mode 100644
index 0000000..e69de29
diff --git a/frame/SpyderFrame.py b/frame/SpyderFrame.py
new file mode 100644
index 0000000..edf2484
--- /dev/null
+++ b/frame/SpyderFrame.py
@@ -0,0 +1,233 @@
+"""
+ @version: v0.3 dev
+ @desc: generic crawler framework
+ @update_log: v0.1 initial architecture: Proxies proxy module, UrlManager, HtmlDownloader, HtmlParser, DataSaver
+             v0.2 added MongoDB storage with support for auto-incrementing MongoDB IDs
+             v0.3 added Redis support: UrlManager can run large jobs on a Redis queue so interrupted crawls resume,
+                  and DataSaver can buffer records in Redis so slow disk I/O does not throttle the crawl
+"""
+import ast
+import json
+import socket
+import threading
+import time
+
+import pandas as pd
+import redis
+import requests
+
+# Shared Redis client used for both the URL queue and the data buffer (localhost, default port)
+redis_client = redis.Redis()
+
+
+# Proxy thread
+class Proxies(threading.Thread):
+
+ def __init__(self):
+ super().__init__()
+        # thread running flag
+ self.__thread__flag = True
+ self.get_proxies_api = "http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId" \
+ "=192b9425f13c47ffbbe4a663c974608b&orderno=YZ2020219595449Wzor&returnType=2&count=1 "
+ self.Proxies = {
+ "http": "",
+ "https": ""
+ }
+
+    # Stop the thread
+ def __exit__(self):
+ self.__thread__flag = False
+
+    # Fetch a new proxy; the downloader also calls this when the current proxy has gone stale
+    def get_proxies(self):
+        for i in range(5):
+            print("Fetching proxy, attempt " + str(i + 1) + "...")
+            res = requests.get(self.get_proxies_api)
+            j = json.loads(res.text)
+            if j['ERRORCODE'] == '0':
+                self.Proxies['http'] = "http://" + j['RESULT'][0]['ip'] + ":" + j['RESULT'][0]['port']
+                self.Proxies['https'] = "http://" + j['RESULT'][0]['ip'] + ":" + j['RESULT'][0]['port']
+                return
+            time.sleep(1.2)
+        print("Failed to obtain a proxy, exiting...")
+        exit(1)
+
+    # Watch the proxy's age and refresh it once it expires
+    def run(self) -> None:
+        self.get_proxies()  # fetch an initial proxy before the first request goes out
+        start_time = time.time()
+        while self.__thread__flag:
+            # treat 60 s as the proxy's lifetime
+            if time.time() - start_time > 60:
+                # reset the age counter and fetch a fresh proxy
+                start_time = time.time()
+                self.get_proxies()
+            time.sleep(3)
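+
+
+# Minimal usage sketch for Proxies (illustrative only; assumes the rotating-proxy
+# API above is reachable, and the target URL is just a placeholder):
+def _proxies_usage_example():
+    proxies = Proxies()
+    proxies.start()  # the thread keeps self.Proxies fresh, rotating roughly every 60 s
+    res = requests.get("https://httpbin.org/ip", proxies=proxies.Proxies, timeout=10)
+    print(res.text)
+    proxies.__exit__()  # ask the refresh thread to stop once the crawl is done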
+
+
+class UrlManager(object):
+ """url管理"""
+
+ # 初始化url池
+ def __init__(self, db_set_name='', use_redis=False):
+ """支持Redis队列解决断点续爬功能,需指定参数use_redis=True
+
+ :param db_set_name str Redis队列数据库名,默认为空
+ """
+ self.use_redis = use_redis
+ self.db_set_name = db_set_name
+ if not use_redis:
+ self.url_list = []
+ self.url_set = set()
+ return
+
+    # Add a URL to the queue (duplicates are dropped)
+ def add_url(self, url: str) -> None:
+ if not self.use_redis:
+ if url not in self.url_set:
+ self.url_set.add(url)
+ self.url_list.append(url)
+ elif redis.sadd("set_" + self.db_set_name, url): # 如果插入成功,会返回数据量
+ redis.rpush("list_" + self.db_set_name, url) # 列表尾部插入
+
+    # Pop a URL from the head of the queue
+ def get(self) -> str:
+ if not self.use_redis:
+ return self.url_list.pop(0)
+ return redis.lpop("list_" + self.db_set_name).decode("utf-8") # 列表头部pop
+
+    # Are there still URLs left in the queue?
+    def not_complete(self) -> bool:
+        if not self.use_redis:
+            return len(self.url_list) > 0
+        return redis_client.llen("list_" + self.db_set_name) != 0
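+
+
+# Minimal usage sketch for UrlManager with the Redis-backed queue (illustrative
+# only; assumes a local Redis on the default port, and "demo_task" is just a
+# placeholder queue name):
+def _url_manager_usage_example():
+    manager = UrlManager(db_set_name="demo_task", use_redis=True)
+    manager.add_url("https://www.zhihu.com/topics")  # duplicates are filtered out by the Redis set
+    while manager.not_complete():
+        print(manager.get())  # URLs left unread here survive a restart, so the crawl can resume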
+
+
+# Page downloader
+class HtmlDownloader(threading.Thread):
+
+ def __init__(self):
+        # Instantiate the Proxies helper
+        super().__init__()
+        self.proxies = Proxies()
+        # Start the proxy refresh thread
+        self.proxies.start()
+        # Default request headers
+ self.headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/85.0.4183.102 Safari/537.36 Edg/85.0.564.51 "
+ }
+        socket.setdefaulttimeout(10)  # global socket timeout
+
+    def download(self, url: str, params=None) -> str:
+        params = params or {}
+        with open("../error_log.txt", "a") as err_log:
+            for i in range(3):
+                try:
+                    res = requests.get(url, params=params, headers=self.headers, proxies=self.proxies.Proxies, timeout=3)
+                    if res.status_code == 200:
+                        return res.text
+                    # Non-200 response: switch proxy and raise
+                    self.proxies.get_proxies()
+                    res.raise_for_status()
+                # Log the failure
+                except requests.exceptions.HTTPError:
+                    print(url + "; HTTPError; Code " + str(res.status_code))
+                    err_log.write(url + "; HTTPError; Code " + str(res.status_code) + "\n")
+                except requests.exceptions.Timeout:
+                    print(url + "; Timeout")
+                    err_log.write(url + "; Timeout\n")
+                except Exception:
+                    print(url + "; Other Error")
+                    err_log.write(url + "; Other Error\n")
+                    self.proxies.get_proxies()
+                print("downloading error, retrying..... {}/3".format(i + 1))
+            raise requests.exceptions.RetryError("failed to download " + url + " after 3 attempts")
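+
+
+# Minimal usage sketch for HtmlDownloader (illustrative only; the target URL and
+# query parameters are placeholders):
+def _html_downloader_usage_example():
+    downloader = HtmlDownloader()  # constructing it also starts the proxy refresh thread
+    downloader.headers["Referer"] = "https://www.zhihu.com/"  # extra headers can be merged in as needed
+    html = downloader.download("https://httpbin.org/get", params={"q": "demo"})
+    print(html[:200])
+    downloader.proxies.__exit__()  # always stop the proxy thread when finished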
+
+
+# HTML parser; meant to be subclassed and overridden by each concrete crawler
+class HtmlParser(object):
+ def __init__(self):
+ return
+
+ def html_parser(self):
+ return
+
+
+class DataSaver(threading.Thread):
+
+ def __init__(self, db_name='', set_name='', use_auto_increase_index=False, use_redis=False):
+ """若要使用Redis缓存数据,指定参数use_redis=True \n使用MongoDB自增ID,指定use_auto_increase_index=True
+ :param db_name: str 可选 要存储的MongoDB数据库名称
+ :param set_name: str 可选 要存储的MongoDB集合名
+ :func run: 采用run同步Redis与Mongo数据
+ """
+ super().__init__()
+ import pymongo
+ mg_client = pymongo.MongoClient("mongodb://localhost:27017/")
+
+ self.db_name = db_name
+ self.set_name = set_name
+ self.use_auto_increase_index = use_auto_increase_index
+        self.__thread__flag = True
+ self.use_redis = use_redis
+
+ self.mg_client_counter = mg_client["counter"]
+ self.mg_client_data = mg_client[db_name]
+ self.mg_data_db = self.mg_client_data[set_name]
+ self.mg_counter_db = self.mg_client_counter[db_name + "@" + set_name]
+ self.nextId = None
+        if use_auto_increase_index:  # using auto-incrementing IDs
+            if db_name + "@" + set_name not in self.mg_client_counter.list_collection_names():
+                self.mg_counter_db.insert_one({
+                    "_id": "_id",
+                    "index": 0
+                })
+
+ def __exit__(self):
+        self.__thread__flag = False
+
+    # CSV export
+    @staticmethod
+    def to_csv(data: list, file_name: str, encoding: str = "utf-8") -> None:
+        """Write records to a CSV file
+
+        :param data: list of dicts, the records to write
+        :param file_name: str file path
+ :param encoding: default "utf-8"
+
+ """
+ pd.DataFrame(data).to_csv(file_name, encoding=encoding)
+
+    # Fetch the next auto-increment ID from the MongoDB counter collection
+ def getNextId(self) -> None:
+ self.nextId = self.mg_counter_db.find_one_and_update({"_id": '_id'}, {"$inc": {"index": 1}})['index']
+
+ def redis_temp(self, data_dict: dict) -> None:
+ """数据缓存到Redis 如果使用此函方法请确保实例化DataSaver时指定了use_redis=True
+ :param data_dict: dict 数据集合
+ """
+ # 有序集合
+ redis.sadd("data_" + self.db_name + "@" + self.set_name, str(data_dict))
+
+ def mongo_insert(self, data_dict: dict) -> None:
+ """向MongoDB直接插入数据,不经过Redis缓存
+ :param data_dict: dict 数据集合
+ """
+ if self.use_auto_increase_index: # 使用自增ID
+ self.getNextId()
+ data_dict.update({"_id": self.nextId})
+ self.mg_data_db.insert(data_dict)
+
+ def run(self):
+ """Redis缓存数据同步到MongoDB, 请在主程序结束后调用本对象的__exit__方法结束该线程"""
+ # 只有在redis缓存数据为空,并且主程序退出的时候才会结束
+ while redis.scard("data_" + self.db_name + "@" + self.set_name) or self.__tread__flag:
+ data = redis.spop("data_" + self.db_name + "@" + self.set_name)
+ if data:
+ data = eval(data.decode("UTF-8"))
+ if self.use_auto_increase_index: # 使用自增ID
+ self.getNextId()
+ data.update({"_id": self.nextId})
+ self.mg_data_db.insert(data)
+ # 没有数据,休息一会
+ time.sleep(1)
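+
+
+# End-to-end sketch of the Redis-buffered storage path (illustrative only; assumes
+# local Redis and MongoDB instances, and the database, collection and record below
+# are placeholders):
+def _data_saver_pipeline_example():
+    saver = DataSaver(db_name="demo_db", set_name="demo_set",
+                      use_auto_increase_index=True, use_redis=True)
+    saver.start()  # the run() thread drains the Redis buffer into MongoDB
+    saver.redis_temp({"title": "placeholder record", "source": "example"})
+    saver.__exit__()  # signal run() to stop once the buffer is empty
+    saver.join()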
diff --git a/tools/KeyWordsSearch.py b/tools/KeyWordsSearch.py
new file mode 100644
index 0000000..558d150
--- /dev/null
+++ b/tools/KeyWordsSearch.py
@@ -0,0 +1,161 @@
+from frame import SpyderFrame
+import json
+
+KWD = ''
+
+
+class HTMLParser(SpyderFrame.HtmlParser):
+
+ def __init__(self, get_detail):
+ super().__init__()
+ self.get_detail = get_detail
+ if get_detail:
+ self.url_manager = SpyderFrame.UrlManager(db_set_name='知乎@' + KWD)
+
+    def parser(self, data_list: list):
+ for data in data_list:
+ _type = data['type']
+ if _type == 'knowledge_ad':
+ yield self._knowledge_ad(data)
+ elif _type == "wiki_box":
+ yield self._wiki_box(data)
+ elif _type == 'search_result':
+ if data['object']['type'] == "answer":
+ yield self._search_result_answer(data)
+ else:
+ print(data)
+ elif _type == "relevant_query" or "multi_answers" or "search_club" or "video_box":
+ continue
+ else:
+ print(data)
+
+ def _knowledge_ad(self, data):
+ self._find_new_url(data['object']['url'])
+ authors = data["object"]["body"]["authors"]
+ for i in range(len(authors)):
+ authors[i].pop("icon")
+ return {
+ "type": "knowledge_ad",
+ "id": data["id"],
+ "title": data["object"]["body"]["title"],
+ "authors": authors,
+ "description": data["object"]["body"]["description"],
+ # "commodity_type": data["object"]["body"]["commodity_type"],
+ "footer": data["object"]["footer"],
+ "url": data['object']['url']
+ }
+
+ def _search_result_answer(self, data):
+ self._find_new_url("https://www.zhihu.com/question/" + data['object']['question']['url'].split('/')[-1])
+ return {
+ "id": data["object"]["id"],
+ "q_id": data["object"]["question"]["id"],
+ "type": "search_result_answer",
+ "author": data["object"]["author"],
+ "q_name": data["object"]["question"]["name"],
+ "content": data["object"]["content"],
+ "excerpt": data["object"]["excerpt"],
+ "created_time": data["object"]["created_time"],
+ "updated_time": data["object"]["updated_time"],
+ "comment_count": data["object"]["comment_count"],
+ "voteup_count": data["object"]["voteup_count"],
+ "q_url": "https://www.zhihu.com/question/" + data['object']['question']['url'].split('/')[-1]
+ }
+
+ def _wiki_box(self, data):
+ data = data['object']
+ self._find_new_url("https://www.zhihu.com/topic/" + data['url'].split('/')[-1])
+ return {
+ "id": data["id"],
+ "aliases": data['aliases'],
+ "discussion_count": data["discussion_count"],
+ "essence_feed_count": data["essence_feed_count"],
+ "excerpt": data["excerpt"],
+ "follower_count": data["follower_count"],
+ "followers_count": data["followers_count"],
+ "introduction": data["introduction"],
+ "questions_count": data["questions_count"],
+ "top_answer_count": data["top_answer_count"],
+ "type": "wiki_box",
+ "url": "https://www.zhihu.com/topic/" + data['url'].split('/')[-1]
+ }
+
+ def _find_new_url(self, url):
+ if self.get_detail:
+ self.url_manager.add_url(url)
+ return
+
+ def _search_result_article(self, data):
+ return
+
+ def _search_result_question(self, data):
+ return
+
+
+def search(keyword):
+ global KWD
+ KWD = keyword
+ base_url = 'https://api.zhihu.com/search_v3'
+ html_downloader = SpyderFrame.HtmlDownloader()
+ data_saver = SpyderFrame.DataSaver(db_name='知乎', set_name=keyword)
+ html_downloader.headers = {
+ "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
+ "application/signed-exchange;v=b3;q=0.9",
+ "accept-encoding": "gzip, deflate, br",
+ "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+ "cache-control": "no-cache",
+ "cookie": "d_c0=\"AADWzpXyDxKPTkP_r53qvH9ipDf4dAG7XE4=|1603087870\"; "
+ "_zap=b47b3886-7c4a-4101-9ee5-4c803bcf6cd8; _xsrf=LRWrd8I0FyQr3hxZ49tYEABlJI0MFizY; "
+ "capsion_ticket=\"2|1:0|10:1603262862|14:capsion_ticket|44"
+ ":N2UxNWE4YzExZWYxNDUwYWFkZjM4MjQ4MDhjNWExNjY"
+ "=|fa44c3793ac9cf5fac96aab9dc9d8faadba2d384e00351c9c9642028ceace6ad\"; "
+ "r_cap_id=\"YmY4MWY5YzA0OWRlNDk0Yjk2MTEyYWEzZDU5MjZmMmM=|1603262864"
+ "|9dbd3b9caeccd1669c26ee92e5b543543a611713\"; "
+ "cap_id=\"OGVlYjJjOTQ2YTgyNGMzZTlmODk4NDUzMzQ0ZTkyNjA=|1603262864"
+ "|5e52e69215700dd4539d66e5a0833dd4a0c4c1fe\"; "
+ "l_cap_id=\"ODViMjY0YmExNWNlNGVmYWJmMGY5MGUyNTUzMjQxMzM=|1603262864"
+ "|8a107e67c1f9223cd88f066cda42b6ce2102b632\"; "
+ "z_c0=Mi4xQnNEUkNBQUFBQUFBQU5iT2xmSVBFaGNBQUFCaEFsVk5saWQ5WUFERVEzVUJpOVdzZHRZcnloaE9OZWVXVDZwTlhR"
+ "|1603262870|42b123d5ae8b1fb74a8815b13eae8cb34f92508c; tst=r; "
+ "q_c1=582f701a20454c59be03f2470d62b194|1603326280000|1603326280000; "
+ "Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1603653130,1603680022,1603682173,1603683176; "
+ "Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1603683176; "
+ "KLBRSID=af132c66e9ed2b57686ff5c489976b91|1603684342|1603684271",
+ "pragma": "no-cache",
+ "sec-fetch-dest": "document",
+ "sec-fetch-mode": "navigate",
+ "sec-fetch-site": "none",
+ "sec-fetch-user": "?1",
+ "upgrade-insecure-requests": "1",
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51 "
+ }
+    params = {
+ "advert_count": "0",
+ "correction": "1",
+ "lc_idx": "0",
+ "limit": "20",
+ "offset": "0",
+ "q": keyword,
+ "search_hash_id": "1e3c9a021028e71019c7977637948651",
+ "show_all_topics": "0",
+ "t": "general",
+ "vertical_info": "0,1,0,0,0,0,0,0,0,2"
+ }
+    html_parser = HTMLParser(get_detail=False)
+    res = html_downloader.download(url=base_url, params=params)
+ while True:
+ res = json.loads(res)
+ for data in html_parser.parser(res['data']):
+ data_saver.mongo_insert(data)
+ if res['paging']['is_end']:
+ break
+ next_url = res['paging']['next']
+ res = html_downloader.download(next_url)
+ # exit
+ html_downloader.proxies.__exit__()
+
+
+if __name__ == '__main__':
+ kwd = input("请输入搜索关键词:")
+ search(kwd)
diff --git a/utils/question.py b/utils/question.py
new file mode 100644
index 0000000..6d982db
--- /dev/null
+++ b/utils/question.py
@@ -0,0 +1,70 @@
+"""
+ @author 满目皆星河
+ @create_date 2020/10/06
+ @update_date 2020/10/06
+ @desc Zhihu answer crawler: fetches every answer under a given question, plus each answer's details, and stores them in MongoDB.知乎.questions
+ @main_function spyder(question_id: str)
+"""
+
+import json
+from frame import SpyderFrame
+
+URL_MANAGER = SpyderFrame.UrlManager()
+
+
+class HtmlParser(SpyderFrame.HtmlParser):
+
+ @staticmethod
+    def question_json_parser(question_text: str):
+        # Parse the raw response text into a dict
+        question_json = json.loads(question_text)
+
+        # Cursor: queue the URL of the next page of answers
+        if not question_json["paging"]["is_end"]:
+            next_url = question_json["paging"]["next"]
+            URL_MANAGER.add_url(next_url)
+
+        # Walk the "data" array of the JSON payload
+        data_results = question_json["data"]
+        for i in range(len(data_results)):
+            # Rename "id" to "_id" so MongoDB uses it as the primary key
+            data_results[i].update({"_id": data_results[i].pop("id")})
+ yield data_results[i]
+
+
+def spyder(question_id: str):
+ """
+    :param question_id: str the numeric ID of the Zhihu question to crawl
+ """
+ base_url_start = "https://www.zhihu.com/api/v4/questions/"
+ base_url_end = "/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed" \
+ "%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by" \
+ "%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count" \
+ "%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info" \
+ "%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting" \
+ "%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B" \
+ "%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics" \
+ "&limit=5&offset=0"
+ html_parser = HtmlParser()
+ html_downloader = SpyderFrame.HtmlDownloader()
+ data_saver = SpyderFrame.DataSaver(db_name="知乎", set_name="questions")
+
+    # Seed the URL queue
+ URL_MANAGER.add_url(url=base_url_start + question_id + base_url_end)
+
+ while URL_MANAGER.not_complete():
+ url = URL_MANAGER.get()
+ for data in html_parser.question_json_parser(html_downloader.download(url)):
+ data_saver.mongo_insert(data_dict=data)
+
+    # Stop the proxy thread
+ html_downloader.proxies.__exit__()
+
+
+if __name__ == '__main__':
+ question_list = []
+ for question in input("请输入问题ID,多个question_id请用英文逗号分隔:").split(","):
+ question_list.append(question.strip())
+
+ for question in question_list:
+ spyder(question)
diff --git a/utils/user.py b/utils/user.py
new file mode 100644
index 0000000..0d69c36
--- /dev/null
+++ b/utils/user.py
@@ -0,0 +1,17 @@
+from frame import SpyderFrame
+
+
+class JsonParser(SpyderFrame.HtmlParser):
+ def __init__(self):
+        super().__init__()
+
+ # def user_answered(self):
+
+
+if __name__ == '__main__':
+ USER_BASE_URL = "https://www.zhihu.com/people/"
+ JSON_PARSER = JsonParser()
+ URL_MANAGER = SpyderFrame.UrlManager("zhihu")
+ URL_MANAGER.add_url(USER_BASE_URL + "dear-w-34")
+ HTML_DOWNLOADER = SpyderFrame.HtmlDownloader()
+ DATA_SAVER = SpyderFrame.DataSaver("知乎", "user")
diff --git a/utils/wiki_box.py b/utils/wiki_box.py
new file mode 100644
index 0000000..0b04d81
--- /dev/null
+++ b/utils/wiki_box.py
@@ -0,0 +1,16 @@
+"""
+ @author 满目皆星河
+ @create_date 2020/10/21
+ @update_date 2020/10/21
+ @desc Zhihu wiki data: some Zhihu questions are collected into the Zhihu wiki; only the wiki entry itself is kept here, while full question details are handled by a dedicated module (utils/question.py)
+ @main_function parser(text: str)
+"""
+
+
+def parser(text: str):
+ """
+    Parse the wiki_box content
+    :param text: html source of the wiki entry
+    :return data: the parsed wiki data as JSON
+ """
+
\ No newline at end of file