From 4375f71c3675b69187b43c5181668f183331d841 Mon Sep 17 00:00:00 2001
From: HanZhuoii
Date: Tue, 27 Oct 2020 00:39:43 +0800
Subject: [PATCH] Building

---
 .idea/.gitignore                                   |   8 +
 .../f96628bb-436c-4786-b8df-bd36cf019416.xml       |  13 +
 .idea/inspectionProfiles/Project_Default.xml       |  20 ++
 .../inspectionProfiles/profiles_settings.xml       |   6 +
 .idea/misc.xml                                     |   4 +
 .idea/modules.xml                                  |   8 +
 ".idea/\347\237\245\344\271\216.iml"               |   8 +
 README.md                                          | Bin 0 -> 34 bytes
 __init__.py                                        |   3 +
 error_log.txt                                      |   0
 frame/SpyderFrame.py                               | 233 ++++++++++++++++++
 tools/KeyWordsSearch.py                            | 161 ++++++++++++
 utils/question.py                                  |  70 ++++++
 utils/user.py                                      |  17 ++
 utils/wiki_box.py                                  |  16 ++
 15 files changed, 567 insertions(+)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/dataSources/f96628bb-436c-4786-b8df-bd36cf019416.xml
 create mode 100644 .idea/inspectionProfiles/Project_Default.xml
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 ".idea/\347\237\245\344\271\216.iml"
 create mode 100644 README.md
 create mode 100644 __init__.py
 create mode 100644 error_log.txt
 create mode 100644 frame/SpyderFrame.py
 create mode 100644 tools/KeyWordsSearch.py
 create mode 100644 utils/question.py
 create mode 100644 utils/user.py
 create mode 100644 utils/wiki_box.py

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..af7c512
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/../../../../:\PycharmProjects\知乎\.idea/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/

[.idea project XML hunks (dataSources/f96628bb-436c-4786-b8df-bd36cf019416.xml,
inspectionProfiles/Project_Default.xml, inspectionProfiles/profiles_settings.xml,
misc.xml, modules.xml, 知乎.iml): IDE-generated XML whose angle-bracket markup was
stripped in extraction; only the MongoDB driver version "4.4.1" is recoverable.]

diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a4ee49be62ff4ce05b247ef4b478a854d8b04ab
GIT binary patch
literal 34
mcmezWPnki1A&Mb`Arpv88FU$f847^x6oyoWA_iUtE(QRYoCfp&

literal 0
HcmV?d00001

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..1a1bb22
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,3 @@
+"""
+    @desc Zhihu spider. Crawl structure: topic square -> topic -> question -> answer -> comment, plus standalone user profiles.
+"""
\ No newline at end of file
diff --git a/error_log.txt b/error_log.txt
new file mode 100644
index 0000000..e69de29
diff --git a/frame/SpyderFrame.py b/frame/SpyderFrame.py
new file mode 100644
index 0000000..edf2484
--- /dev/null
+++ b/frame/SpyderFrame.py
@@ -0,0 +1,233 @@
+"""
+    @version: v0.3 dev
+    @desc: generic spider framework
+    @update_log: v0.1 initial architecture: Proxies proxy module, UrlManager, HtmlDownloader, HtmlParser, DataSaver
+                 v0.2 added MongoDB storage, with optional MongoDB auto-increment IDs
+                 v0.3 added Redis support: UrlManager can keep its queue in Redis so large crawls resume after an
+                      interruption, and DataSaver can buffer rows in Redis so slow disk I/O does not stall the crawl
+"""
+import json
+import threading
+import pandas as pd
+import requests
+import time
+import redis
+import socket
+
+# module-level client shared by UrlManager and DataSaver (named so it does not shadow the redis module)
+redis_client = redis.Redis()
+
+
+# proxy-refresh thread
+class Proxies(threading.Thread):
+
+    def __init__(self):
+        super().__init__()
+        # thread running flag
+        self.__thread_flag = True
+        self.get_proxies_api = "http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId" \
+                               "=192b9425f13c47ffbbe4a663c974608b&orderno=YZ2020219595449Wzor&returnType=2&count=1 "
+        self.Proxies = {
+            "http": "",
+            "https": ""
+        }
+
+    # stop the thread
+    def __exit__(self):
+        self.__thread_flag = False
+
+    # other threads call this directly to force a refresh when the current proxy has gone bad
+    def get_proxies(self):
+        for i in range(5):
+            print("Fetching proxy, attempt " + str(i + 1) + "...")
+            res = requests.get(self.get_proxies_api)
+            j = json.loads(res.text)
+            if j['ERRORCODE'] == '0':
+                self.Proxies['http'] = "http://" + j['RESULT'][0]['ip'] + ":" + j['RESULT'][0]['port']
+                self.Proxies['https'] = "http://" + j['RESULT'][0]['ip'] + ":" + j['RESULT'][0]['port']
+                return
+            time.sleep(1.2)
+        print("Failed to fetch a proxy after 5 attempts, exiting...")
+        exit(1)
+
+    # watch the proxy's age and replace it once it expires
+    def run(self) -> None:
+        start_time = time.time()
+        while self.__thread_flag:
+            # proxy lifetime is 60 s
+            if time.time() - start_time > 60:
+                # reset the age counter
+                start_time = time.time()
+                self.get_proxies()
+            time.sleep(3)
+
+
+class UrlManager(object):
+    """URL queue with de-duplication"""
+
+    def __init__(self, db_set_name='', use_redis=False):
+        """Pass use_redis=True to keep the queue in Redis so an interrupted crawl can resume.
+
+        :param db_set_name: str, Redis key suffix for this queue, empty by default
+        :param use_redis: bool, back the queue with Redis instead of process memory
+        """
+        self.use_redis = use_redis
+        self.db_set_name = db_set_name
+        if not use_redis:
+            self.url_list = []
+            self.url_set = set()
+
+    # enqueue a URL, skipping anything seen before
+    def add_url(self, url: str) -> None:
+        if not self.use_redis:
+            if url not in self.url_set:
+                self.url_set.add(url)
+                self.url_list.append(url)
+        elif redis_client.sadd("set_" + self.db_set_name, url):  # sadd returns 1 only for a new member
+            redis_client.rpush("list_" + self.db_set_name, url)  # append at the tail
+
+    # dequeue a URL from the head
+    def get(self) -> str:
+        if not self.use_redis:
+            return self.url_list.pop(0)
+        return redis_client.lpop("list_" + self.db_set_name).decode("utf-8")  # pop from the head
+
+    # is anything left in the queue?
+    def not_complete(self) -> bool:
+        if not self.use_redis:
+            return len(self.url_list) > 0
+        return redis_client.llen("list_" + self.db_set_name) != 0
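+
+# A minimal usage sketch (illustrative comment only, not executed and not part of
+# the original patch): UrlManager de-duplicates on insert and hands URLs back
+# FIFO, so each page is visited once no matter how often parsers re-discover it.
+#
+#     manager = UrlManager()                                 # in-memory queue
+#     manager.add_url("https://www.zhihu.com/question/1")
+#     manager.add_url("https://www.zhihu.com/question/1")    # duplicate, dropped
+#     while manager.not_complete():
+#         print(manager.get())                               # printed exactly once
+#
+# With use_redis=True the same calls land in a Redis set/list pair instead, so
+# the queue survives a crash and a restarted process resumes where it stopped.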
+
+
+# page downloader
+class HtmlDownloader(threading.Thread):
+
+    def __init__(self):
+        super().__init__()
+        # create and start the proxy-refresh thread
+        self.proxies = Proxies()
+        self.proxies.start()
+        # default request headers
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                          "Chrome/85.0.4183.102 Safari/537.36 Edg/85.0.564.51 "
+        }
+        socket.setdefaulttimeout(10)  # global socket timeout
+
+    def download(self, url: str, params=None) -> str:
+        with open("../error_log.txt", "a") as err_log:  # append, so earlier errors are kept
+            for i in range(3):
+                try:
+                    res = requests.get(url, params=params, headers=self.headers,
+                                       proxies=self.proxies.Proxies, timeout=3)
+                    if res.status_code == 200:
+                        return res.text
+                    # non-200: swap the proxy, then raise
+                    self.proxies.get_proxies()
+                    res.raise_for_status()
+                # log the failure
+                except requests.exceptions.HTTPError:
+                    print(url + "; HTTPError; Code " + str(res.status_code))
+                    err_log.write(url + "; HTTPError; Code " + str(res.status_code) + "\n")
+                except requests.exceptions.Timeout:
+                    print(url + "; Timeout")
+                    err_log.write(url + "; Timeout\n")
+                except Exception:
+                    print(url + "; Other Error")
+                    err_log.write(url + "; Other Error\n")
+                self.proxies.get_proxies()
+                print("downloading error, retrying... {}/3".format(i + 1))
+        raise requests.exceptions.RetryError
+
+
+# HTML parsing; each concrete spider overrides this
+class HtmlParser(object):
+    def __init__(self):
+        return
+
+    def html_parser(self):
+        return
+
+
+class DataSaver(threading.Thread):
+
+    def __init__(self, db_name='', set_name='', use_auto_increase_index=False, use_redis=False):
+        """Pass use_redis=True to buffer rows in Redis; pass use_auto_increase_index=True for MongoDB auto-increment IDs.
+
+        :param db_name: str, optional, target MongoDB database name
+        :param set_name: str, optional, target MongoDB collection name
+        :func run: drains the Redis buffer into MongoDB
+        """
+        super().__init__()
+        import pymongo
+        mg_client = pymongo.MongoClient("mongodb://localhost:27017/")
+
+        self.db_name = db_name
+        self.set_name = set_name
+        self.use_auto_increase_index = use_auto_increase_index
+        self.__thread_flag = True
+        self.use_redis = use_redis
+
+        self.mg_client_counter = mg_client["counter"]
+        self.mg_client_data = mg_client[db_name]
+        self.mg_data_db = self.mg_client_data[set_name]
+        self.mg_counter_db = self.mg_client_counter[db_name + "@" + set_name]
+        self.nextId = None
+        if use_auto_increase_index:  # seed the counter document on first use
+            if db_name + "@" + set_name not in self.mg_client_counter.list_collection_names():
+                self.mg_counter_db.insert_one({
+                    "_id": "_id",
+                    "index": 0
+                })
+
+    def __exit__(self):
+        self.__thread_flag = False
+
+    # CSV output
+    @staticmethod
+    def to_csv(data: list, file_name: str, encoding: str = "utf-8") -> None:
+        """Write rows to a CSV file.
+
+        :param data: list of dicts, the rows
+        :param file_name: str, output path
+        :param encoding: default "utf-8"
+        """
+        pd.DataFrame(data).to_csv(file_name, encoding=encoding)
+
+    # MongoDB auto-increment ID
+    def getNextId(self) -> None:
+        self.nextId = self.mg_counter_db.find_one_and_update({"_id": '_id'}, {"$inc": {"index": 1}})['index']
+
+    def redis_temp(self, data_dict: dict) -> None:
+        """Buffer a row in Redis; only valid if this DataSaver was built with use_redis=True.
+
+        :param data_dict: dict, one row
+        """
+        # plain (unordered) set, which also de-duplicates rows
+        redis_client.sadd("data_" + self.db_name + "@" + self.set_name, str(data_dict))
+
+    def mongo_insert(self, data_dict: dict) -> None:
+        """Insert a row straight into MongoDB, bypassing the Redis buffer.
+
+        :param data_dict: dict, one row
+        """
+        if self.use_auto_increase_index:  # auto-increment ID
+            self.getNextId()
+            data_dict.update({"_id": self.nextId})
+        self.mg_data_db.insert_one(data_dict)
+
+    def run(self):
+        """Drain the Redis buffer into MongoDB; call this object's __exit__ once the main program is done."""
+        # exits only when the Redis buffer is empty and the main program has signalled shutdown
+        while redis_client.scard("data_" + self.db_name + "@" + self.set_name) or self.__thread_flag:
+            data = redis_client.spop("data_" + self.db_name + "@" + self.set_name)
+            if data:
+                # rows were buffered with str(dict); eval restores them (local, trusted data only)
+                data = eval(data.decode("UTF-8"))
+                if self.use_auto_increase_index:  # auto-increment ID
+                    self.getNextId()
+                    data.update({"_id": self.nextId})
+                self.mg_data_db.insert_one(data)
+            else:
+                # buffer is empty, back off for a moment
+                time.sleep(1)
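+
+
+# End-to-end wiring sketch (illustrative comment only, not part of the original
+# patch; assumes a local Redis and MongoDB and a concrete HtmlParser subclass):
+#
+#     downloader = HtmlDownloader()                 # also starts the Proxies thread
+#     saver = DataSaver(db_name="demo", set_name="pages", use_redis=True)
+#     saver.start()                                 # begins draining Redis -> MongoDB
+#     manager = UrlManager(db_set_name="demo", use_redis=True)
+#     manager.add_url("https://example.com/")
+#     while manager.not_complete():
+#         html = downloader.download(manager.get())
+#         saver.redis_temp({"html": html})          # buffered; written out by saver.run()
+#     downloader.proxies.__exit__()                 # stop the proxy thread
+#     saver.__exit__()                              # run() exits once the buffer drains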
diff --git a/tools/KeyWordsSearch.py b/tools/KeyWordsSearch.py
new file mode 100644
index 0000000..558d150
--- /dev/null
+++ b/tools/KeyWordsSearch.py
@@ -0,0 +1,161 @@
+from frame import SpyderFrame
+import json
+
+KWD = ''
+
+
+class HTMLParser(SpyderFrame.HtmlParser):
+
+    def __init__(self, get_detail):
+        super().__init__()
+        self.get_detail = get_detail
+        if get_detail:
+            self.url_manager = SpyderFrame.UrlManager(db_set_name='知乎@' + KWD)
+
+    def parser(self, data_list: list):
+        """Yield one flat record per recognised search result."""
+        for data in data_list:
+            _type = data['type']
+            if _type == 'knowledge_ad':
+                yield self._knowledge_ad(data)
+            elif _type == "wiki_box":
+                yield self._wiki_box(data)
+            elif _type == 'search_result':
+                if data['object']['type'] == "answer":
+                    yield self._search_result_answer(data)
+                else:
+                    print(data)
+            elif _type in ("relevant_query", "multi_answers", "search_club", "video_box"):
+                continue
+            else:
+                print(data)
+
+    def _knowledge_ad(self, data):
+        self._find_new_url(data['object']['url'])
+        authors = data["object"]["body"]["authors"]
+        for i in range(len(authors)):
+            authors[i].pop("icon")
+        return {
+            "type": "knowledge_ad",
+            "id": data["id"],
+            "title": data["object"]["body"]["title"],
+            "authors": authors,
+            "description": data["object"]["body"]["description"],
+            # "commodity_type": data["object"]["body"]["commodity_type"],
+            "footer": data["object"]["footer"],
+            "url": data['object']['url']
+        }
+
+    def _search_result_answer(self, data):
+        self._find_new_url("https://www.zhihu.com/question/" + data['object']['question']['url'].split('/')[-1])
+        return {
+            "id": data["object"]["id"],
+            "q_id": data["object"]["question"]["id"],
+            "type": "search_result_answer",
+            "author": data["object"]["author"],
+            "q_name": data["object"]["question"]["name"],
+            "content": data["object"]["content"],
+            "excerpt": data["object"]["excerpt"],
+            "created_time": data["object"]["created_time"],
+            "updated_time": data["object"]["updated_time"],
+            "comment_count": data["object"]["comment_count"],
+            "voteup_count": data["object"]["voteup_count"],
+            "q_url": "https://www.zhihu.com/question/" + data['object']['question']['url'].split('/')[-1]
+        }
+
+    def _wiki_box(self, data):
+        data = data['object']
+        self._find_new_url("https://www.zhihu.com/topic/" + data['url'].split('/')[-1])
+        return {
+            "id": data["id"],
+            "aliases": data['aliases'],
+            "discussion_count": data["discussion_count"],
+            "essence_feed_count": data["essence_feed_count"],
+            "excerpt": data["excerpt"],
+            "follower_count": data["follower_count"],
+            "followers_count": data["followers_count"],
+            "introduction": data["introduction"],
+            "questions_count": data["questions_count"],
+            "top_answer_count": data["top_answer_count"],
+            "type": "wiki_box",
+            "url": "https://www.zhihu.com/topic/" + data['url'].split('/')[-1]
+        }
+
+    def _find_new_url(self, url):
+        if self.get_detail:
+            self.url_manager.add_url(url)
+
+    def _search_result_article(self, data):
+        return
+
+    def _search_result_question(self, data):
+        return
+
+
+def search(keyword):
+    global KWD
+    KWD = keyword
+    base_url = 'https://api.zhihu.com/search_v3'
+    html_downloader = SpyderFrame.HtmlDownloader()
+    data_saver = SpyderFrame.DataSaver(db_name='知乎', set_name=keyword)
+    html_downloader.headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
+                  "application/signed-exchange;v=b3;q=0.9",
+        "accept-encoding": "gzip, deflate, br",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+        "cache-control": "no-cache",
+        "cookie": "d_c0=\"AADWzpXyDxKPTkP_r53qvH9ipDf4dAG7XE4=|1603087870\"; "
+                  "_zap=b47b3886-7c4a-4101-9ee5-4c803bcf6cd8; _xsrf=LRWrd8I0FyQr3hxZ49tYEABlJI0MFizY; "
+                  "capsion_ticket=\"2|1:0|10:1603262862|14:capsion_ticket|44"
+                  ":N2UxNWE4YzExZWYxNDUwYWFkZjM4MjQ4MDhjNWExNjY"
+                  "=|fa44c3793ac9cf5fac96aab9dc9d8faadba2d384e00351c9c9642028ceace6ad\"; "
+                  "r_cap_id=\"YmY4MWY5YzA0OWRlNDk0Yjk2MTEyYWEzZDU5MjZmMmM=|1603262864"
+                  "|9dbd3b9caeccd1669c26ee92e5b543543a611713\"; "
+                  "cap_id=\"OGVlYjJjOTQ2YTgyNGMzZTlmODk4NDUzMzQ0ZTkyNjA=|1603262864"
+                  "|5e52e69215700dd4539d66e5a0833dd4a0c4c1fe\"; "
+                  "l_cap_id=\"ODViMjY0YmExNWNlNGVmYWJmMGY5MGUyNTUzMjQxMzM=|1603262864"
+                  "|8a107e67c1f9223cd88f066cda42b6ce2102b632\"; "
+                  "z_c0=Mi4xQnNEUkNBQUFBQUFBQU5iT2xmSVBFaGNBQUFCaEFsVk5saWQ5WUFERVEzVUJpOVdzZHRZcnloaE9OZWVXVDZwTlhR"
+                  "|1603262870|42b123d5ae8b1fb74a8815b13eae8cb34f92508c; tst=r; "
+                  "q_c1=582f701a20454c59be03f2470d62b194|1603326280000|1603326280000; "
+                  "Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1603653130,1603680022,1603682173,1603683176; "
+                  "Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1603683176; "
+                  "KLBRSID=af132c66e9ed2b57686ff5c489976b91|1603684342|1603684271",
+        "pragma": "no-cache",
+        "sec-fetch-dest": "document",
+        "sec-fetch-mode": "navigate",
+        "sec-fetch-site": "none",
+        "sec-fetch-user": "?1",
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                      "Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51 "
+    }
+    params = {
+        "advert_count": "0",
+        "correction": "1",
+        "lc_idx": "0",
+        "limit": "20",
+        "offset": "0",
+        "q": keyword,
+        "search_hash_id": "1e3c9a021028e71019c7977637948651",
+        "show_all_topics": "0",
+        "t": "general",
+        "vertical_info": "0,1,0,0,0,0,0,0,0,2"
+    }
+    html_parser = HTMLParser(get_detail=0)
+    res = html_downloader.download(url=base_url, params=params)
+    while True:
+        res = json.loads(res)
+        for data in html_parser.parser(res['data']):
+            data_saver.mongo_insert(data)
+        if res['paging']['is_end']:
+            break
+        next_url = res['paging']['next']
+        res = html_downloader.download(next_url)
+    # stop the proxy thread before exiting
+    html_downloader.proxies.__exit__()
+
+
+if __name__ == '__main__':
+    kwd = input("Enter a search keyword: ")
+    search(kwd)
diff --git a/utils/question.py b/utils/question.py
new file mode 100644
index 0000000..6d982db
--- /dev/null
+++ b/utils/question.py
@@ -0,0 +1,70 @@
+"""
+    @author 满目皆星河
+    @create_date 2020/10/06
+    @update_date 2020/10/06
+    @desc Zhihu answer spider: crawls every answer under a given question, with full answer details,
+          and stores them in MongoDB 知乎.questions
+    @main_function spyder(question_id: str)
+"""
+
+import json
+from frame import SpyderFrame
+
+URL_MANAGER = SpyderFrame.UrlManager()
+
+
+class HtmlParser(SpyderFrame.HtmlParser):
+
+    @staticmethod
+    def question_json_parser(question_text: str):
+        # parse the JSON string into a dict
+        question_json = json.loads(question_text)
+
+        # cursor: queue the URL of the next page, if any
+        if not question_json["paging"]["is_end"]:
+            next_url = question_json["paging"]["next"]
+            URL_MANAGER.add_url(next_url)
+
+        # unpack the records in the "data" array
+        data_results = question_json["data"]
+        for i in range(len(data_results)):
+            # rename "id" to "_id", MongoDB's primary-key field
+            data_results[i].update({"_id": data_results[i].pop("id")})
+            yield data_results[i]
+
+
+def spyder(question_id: str):
+    """Crawl every answer under one question.
+
+    :param question_id: str, the numeric question ID
+    """
+    base_url_start = "https://www.zhihu.com/api/v4/questions/"
+    base_url_end = "/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed" \
+                   "%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by" \
+                   "%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count" \
+                   "%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info" \
+                   "%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting" \
+                   "%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B" \
+                   "%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics" \
+                   "&limit=5&offset=0"
+    html_parser = HtmlParser()
+    html_downloader = SpyderFrame.HtmlDownloader()
+    data_saver = SpyderFrame.DataSaver(db_name="知乎", set_name="questions")
+
+    # seed the URL queue
+    URL_MANAGER.add_url(url=base_url_start + question_id + base_url_end)
+
+    while URL_MANAGER.not_complete():
+        url = URL_MANAGER.get()
+        for data in html_parser.question_json_parser(html_downloader.download(url)):
+            data_saver.mongo_insert(data_dict=data)
+
+    # stop the proxy thread
+    html_downloader.proxies.__exit__()
+
+
+if __name__ == '__main__':
+    question_list = []
+    for question in input("Enter question IDs, comma-separated: ").split(","):
+        question_list.append(question.strip())
+
+    for question in question_list:
+        spyder(question)
diff --git a/utils/user.py b/utils/user.py
new file mode 100644
index 0000000..0d69c36
--- /dev/null
+++ b/utils/user.py
@@ -0,0 +1,17 @@
+from frame import SpyderFrame
+
+
+class JsonParser(SpyderFrame.HtmlParser):
+    def __init__(self):
+        super().__init__()
+
+    # def user_answered(self):
+
+
+if __name__ == '__main__':
+    USER_BASE_URL = "https://www.zhihu.com/people/"
+    JSON_PARSER = JsonParser()
+    URL_MANAGER = SpyderFrame.UrlManager("zhihu")
+    URL_MANAGER.add_url(USER_BASE_URL + "dear-w-34")
+    HTML_DOWNLOADER = SpyderFrame.HtmlDownloader()
+    DATA_SAVER = SpyderFrame.DataSaver("知乎", "user")
diff --git a/utils/wiki_box.py b/utils/wiki_box.py
new file mode 100644
index 0000000..0b04d81
--- /dev/null
+++ b/utils/wiki_box.py
@@ -0,0 +1,16 @@
+"""
+    @author 满目皆星河
+    @create_date 2020/10/21
+    @update_date 2020/10/21
+    @desc Zhihu wiki data: some questions are folded into a Zhihu wiki entry. Only the wiki entry itself
+          is kept here; full question details are handled elsewhere (utils/question.py)
+    @main_function spyder(question_id: str)
+"""
+
+
+def parser(text: str):
+    """Parse the wiki_box content.
+
+    :param text: raw HTML
+    :return data: the parsed wiki record
+    """
+    raise NotImplementedError  # body not yet written in this patch