From 4375f71c3675b69187b43c5181668f183331d841 Mon Sep 17 00:00:00 2001
From: HanZhuoii
Date: Tue, 27 Oct 2020 00:39:43 +0800
Subject: [PATCH] Building

---
 .idea/.gitignore                                   |   8 +
 .../f96628bb-436c-4786-b8df-bd36cf019416.xml       |  13 +
 .idea/inspectionProfiles/Project_Default.xml       |  20 ++
 .../inspectionProfiles/profiles_settings.xml       |   6 +
 .idea/misc.xml                                     |   4 +
 .idea/modules.xml                                  |   8 +
 ".idea/\347\237\245\344\271\216.iml"               |   8 +
 README.md                                          | Bin 0 -> 34 bytes
 __init__.py                                        |   3 +
 error_log.txt                                      |   0
 frame/SpyderFrame.py                               | 233 ++++++++++++++++++
 tools/KeyWordsSearch.py                            | 161 ++++++++++++
 utils/question.py                                  |  70 ++++++
 utils/user.py                                      |  17 ++
 utils/wiki_box.py                                  |  16 ++
 15 files changed, 567 insertions(+)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/dataSources/f96628bb-436c-4786-b8df-bd36cf019416.xml
 create mode 100644 .idea/inspectionProfiles/Project_Default.xml
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 ".idea/\347\237\245\344\271\216.iml"
 create mode 100644 README.md
 create mode 100644 __init__.py
 create mode 100644 error_log.txt
 create mode 100644 frame/SpyderFrame.py
 create mode 100644 tools/KeyWordsSearch.py
 create mode 100644 utils/question.py
 create mode 100644 utils/user.py
 create mode 100644 utils/wiki_box.py

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..af7c512
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/../../../../:\PycharmProjects\知乎\.idea/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/

[.idea project XML hunks (dataSources/f96628bb-436c-4786-b8df-bd36cf019416.xml,
inspectionProfiles/Project_Default.xml, inspectionProfiles/profiles_settings.xml,
misc.xml, modules.xml, 知乎.iml): IDE-generated XML whose angle-bracket markup was
stripped in extraction; only the MongoDB driver version "4.4.1" is recoverable.]

diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a4ee49be62ff4ce05b247ef4b478a854d8b04ab
GIT binary patch
literal 34
mcmezWPnki1A&Mb`Arpv88FU$f847^x6oyoWA_iUtE(QRYoCfp&

literal 0
HcmV?d00001

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..1a1bb22
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,3 @@
+"""
+    @desc Zhihu spider. Crawl structure: topic square -> topic -> question -> answer -> comment, plus standalone user profiles.
+"""
\ No newline at end of file
diff --git a/error_log.txt b/error_log.txt
new file mode 100644
index 0000000..e69de29
diff --git a/frame/SpyderFrame.py b/frame/SpyderFrame.py
new file mode 100644
index 0000000..edf2484
--- /dev/null
+++ b/frame/SpyderFrame.py
@@ -0,0 +1,233 @@
+"""
+    @version: v0.3 dev
+    @desc: generic spider framework
+    @update_log: v0.1 initial architecture: Proxies proxy module, UrlManager, HtmlDownloader, HtmlParser, DataSaver
+                 v0.2 added MongoDB storage, with optional MongoDB auto-increment IDs
+                 v0.3 added Redis support: UrlManager can keep its queue in Redis so large crawls resume after an
+                      interruption, and DataSaver can buffer rows in Redis so slow disk I/O does not stall the crawl
+"""
+import json
+import threading
+import pandas as pd
+import requests
+import time
+import redis
+import socket
+
+# module-level client shared by UrlManager and DataSaver (named so it does not shadow the redis module)
+redis_client = redis.Redis()
+
+
+# proxy-refresh thread
+class Proxies(threading.Thread):
+
+    def __init__(self):
+        super().__init__()
+        # thread running flag
+        self.__thread_flag = True
+        self.get_proxies_api = "http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId" \
+                               "=192b9425f13c47ffbbe4a663c974608b&orderno=YZ2020219595449Wzor&returnType=2&count=1 "
+        self.Proxies = {
+            "http": "",
+            "https": ""
+        }
+
+    # stop the thread
+    def __exit__(self):
+        self.__thread_flag = False
+
+    # other threads call this directly to force a refresh when the current proxy has gone bad
+    def get_proxies(self):
+        for i in range(5):
+            print("Fetching proxy, attempt " + str(i + 1) + "...")
+            res = requests.get(self.get_proxies_api)
+            j = json.loads(res.text)
+            if j['ERRORCODE'] == '0':
+                self.Proxies['http'] = "http://" + j['RESULT'][0]['ip'] + ":" + j['RESULT'][0]['port']
+                self.Proxies['https'] = "http://" + j['RESULT'][0]['ip'] + ":" + j['RESULT'][0]['port']
+                return
+            time.sleep(1.2)
+        print("Failed to fetch a proxy after 5 attempts, exiting...")
+        exit(1)
+
+    # watch the proxy's age and replace it once it expires
+    def run(self) -> None:
+        start_time = time.time()
+        while self.__thread_flag:
+            # proxy lifetime is 60 s
+            if time.time() - start_time > 60:
+                # reset the age counter
+                start_time = time.time()
+                self.get_proxies()
+            time.sleep(3)
+
+
+class UrlManager(object):
+    """URL queue with de-duplication"""
+
+    def __init__(self, db_set_name='', use_redis=False):
+        """Pass use_redis=True to keep the queue in Redis so an interrupted crawl can resume.
+
+        :param db_set_name: str, Redis key suffix for this queue, empty by default
+        :param use_redis: bool, back the queue with Redis instead of process memory
+        """
+        self.use_redis = use_redis
+        self.db_set_name = db_set_name
+        if not use_redis:
+            self.url_list = []
+            self.url_set = set()
+
+    # enqueue a URL, skipping anything seen before
+    def add_url(self, url: str) -> None:
+        if not self.use_redis:
+            if url not in self.url_set:
+                self.url_set.add(url)
+                self.url_list.append(url)
+        elif redis_client.sadd("set_" + self.db_set_name, url):  # sadd returns 1 only for a new member
+            redis_client.rpush("list_" + self.db_set_name, url)  # append at the tail
+
+    # dequeue a URL from the head
+    def get(self) -> str:
+        if not self.use_redis:
+            return self.url_list.pop(0)
+        return redis_client.lpop("list_" + self.db_set_name).decode("utf-8")  # pop from the head
+
+    # is anything left in the queue?
+    def not_complete(self) -> bool:
+        if not self.use_redis:
+            return len(self.url_list) > 0
+        return redis_client.llen("list_" + self.db_set_name) != 0
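+
+# A minimal usage sketch (illustrative comment only, not executed and not part of
+# the original patch): UrlManager de-duplicates on insert and hands URLs back
+# FIFO, so each page is visited once no matter how often parsers re-discover it.
+#
+#     manager = UrlManager()                                 # in-memory queue
+#     manager.add_url("https://www.zhihu.com/question/1")
+#     manager.add_url("https://www.zhihu.com/question/1")    # duplicate, dropped
+#     while manager.not_complete():
+#         print(manager.get())                               # printed exactly once
+#
+# With use_redis=True the same calls land in a Redis set/list pair instead, so
+# the queue survives a crash and a restarted process resumes where it stopped.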
+
+
+# page downloader
+class HtmlDownloader(threading.Thread):
+
+    def __init__(self):
+        super().__init__()
+        # create and start the proxy-refresh thread
+        self.proxies = Proxies()
+        self.proxies.start()
+        # default request headers
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                          "Chrome/85.0.4183.102 Safari/537.36 Edg/85.0.564.51 "
+        }
+        socket.setdefaulttimeout(10)  # global socket timeout
+
+    def download(self, url: str, params=None) -> str:
+        with open("../error_log.txt", "a") as err_log:  # append, so earlier errors are kept
+            for i in range(3):
+                try:
+                    res = requests.get(url, params=params, headers=self.headers,
+                                       proxies=self.proxies.Proxies, timeout=3)
+                    if res.status_code == 200:
+                        return res.text
+                    # non-200: swap the proxy, then raise
+                    self.proxies.get_proxies()
+                    res.raise_for_status()
+                # log the failure
+                except requests.exceptions.HTTPError:
+                    print(url + "; HTTPError; Code " + str(res.status_code))
+                    err_log.write(url + "; HTTPError; Code " + str(res.status_code) + "\n")
+                except requests.exceptions.Timeout:
+                    print(url + "; Timeout")
+                    err_log.write(url + "; Timeout\n")
+                except Exception:
+                    print(url + "; Other Error")
+                    err_log.write(url + "; Other Error\n")
+                self.proxies.get_proxies()
+                print("downloading error, retrying... {}/3".format(i + 1))
+        raise requests.exceptions.RetryError
+
+
+# HTML parsing; each concrete spider overrides this
+class HtmlParser(object):
+    def __init__(self):
+        return
+
+    def html_parser(self):
+        return
+
+
+class DataSaver(threading.Thread):
+
+    def __init__(self, db_name='', set_name='', use_auto_increase_index=False, use_redis=False):
+        """Pass use_redis=True to buffer rows in Redis; pass use_auto_increase_index=True for MongoDB auto-increment IDs.
+
+        :param db_name: str, optional, target MongoDB database name
+        :param set_name: str, optional, target MongoDB collection name
+        :func run: drains the Redis buffer into MongoDB
+        """
+        super().__init__()
+        import pymongo
+        mg_client = pymongo.MongoClient("mongodb://localhost:27017/")
+
+        self.db_name = db_name
+        self.set_name = set_name
+        self.use_auto_increase_index = use_auto_increase_index
+        self.__thread_flag = True
+        self.use_redis = use_redis
+
+        self.mg_client_counter = mg_client["counter"]
+        self.mg_client_data = mg_client[db_name]
+        self.mg_data_db = self.mg_client_data[set_name]
+        self.mg_counter_db = self.mg_client_counter[db_name + "@" + set_name]
+        self.nextId = None
+        if use_auto_increase_index:  # seed the counter document on first use
+            if db_name + "@" + set_name not in self.mg_client_counter.list_collection_names():
+                self.mg_counter_db.insert_one({
+                    "_id": "_id",
+                    "index": 0
+                })
+
+    def __exit__(self):
+        self.__thread_flag = False
+
+    # CSV output
+    @staticmethod
+    def to_csv(data: list, file_name: str, encoding: str = "utf-8") -> None:
+        """Write rows to a CSV file.
+
+        :param data: list of dicts, the rows
+        :param file_name: str, output path
+        :param encoding: default "utf-8"
+        """
+        pd.DataFrame(data).to_csv(file_name, encoding=encoding)
+
+    # MongoDB auto-increment ID
+    def getNextId(self) -> None:
+        self.nextId = self.mg_counter_db.find_one_and_update({"_id": '_id'}, {"$inc": {"index": 1}})['index']
+
+    def redis_temp(self, data_dict: dict) -> None:
+        """Buffer a row in Redis; only valid if this DataSaver was built with use_redis=True.
+
+        :param data_dict: dict, one row
+        """
+        # plain (unordered) set, which also de-duplicates rows
+        redis_client.sadd("data_" + self.db_name + "@" + self.set_name, str(data_dict))
+
+    def mongo_insert(self, data_dict: dict) -> None:
+        """Insert a row straight into MongoDB, bypassing the Redis buffer.
+
+        :param data_dict: dict, one row
+        """
+        if self.use_auto_increase_index:  # auto-increment ID
+            self.getNextId()
+            data_dict.update({"_id": self.nextId})
+        self.mg_data_db.insert_one(data_dict)
+
+    def run(self):
+        """Drain the Redis buffer into MongoDB; call this object's __exit__ once the main program is done."""
+        # exits only when the Redis buffer is empty and the main program has signalled shutdown
+        while redis_client.scard("data_" + self.db_name + "@" + self.set_name) or self.__thread_flag:
+            data = redis_client.spop("data_" + self.db_name + "@" + self.set_name)
+            if data:
+                # rows were buffered with str(dict); eval restores them (local, trusted data only)
+                data = eval(data.decode("UTF-8"))
+                if self.use_auto_increase_index:  # auto-increment ID
+                    self.getNextId()
+                    data.update({"_id": self.nextId})
+                self.mg_data_db.insert_one(data)
+            else:
+                # buffer is empty, back off for a moment
+                time.sleep(1)
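+
+
+# End-to-end wiring sketch (illustrative comment only, not part of the original
+# patch; assumes a local Redis and MongoDB and a concrete HtmlParser subclass):
+#
+#     downloader = HtmlDownloader()                 # also starts the Proxies thread
+#     saver = DataSaver(db_name="demo", set_name="pages", use_redis=True)
+#     saver.start()                                 # begins draining Redis -> MongoDB
+#     manager = UrlManager(db_set_name="demo", use_redis=True)
+#     manager.add_url("https://example.com/")
+#     while manager.not_complete():
+#         html = downloader.download(manager.get())
+#         saver.redis_temp({"html": html})          # buffered; written out by saver.run()
+#     downloader.proxies.__exit__()                 # stop the proxy thread
+#     saver.__exit__()                              # run() exits once the buffer drains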
diff --git a/tools/KeyWordsSearch.py b/tools/KeyWordsSearch.py
new file mode 100644
index 0000000..558d150
--- /dev/null
+++ b/tools/KeyWordsSearch.py
@@ -0,0 +1,161 @@
+from frame import SpyderFrame
+import json
+
+KWD = ''
+
+
+class HTMLParser(SpyderFrame.HtmlParser):
+
+    def __init__(self, get_detail):
+        super().__init__()
+        self.get_detail = get_detail
+        if get_detail:
+            self.url_manager = SpyderFrame.UrlManager(db_set_name='知乎@' + KWD)
+
+    def parser(self, data_list: list):
+        """Yield one flat record per recognised search result."""
+        for data in data_list:
+            _type = data['type']
+            if _type == 'knowledge_ad':
+                yield self._knowledge_ad(data)
+            elif _type == "wiki_box":
+                yield self._wiki_box(data)
+            elif _type == 'search_result':
+                if data['object']['type'] == "answer":
+                    yield self._search_result_answer(data)
+                else:
+                    print(data)
+            elif _type in ("relevant_query", "multi_answers", "search_club", "video_box"):
+                continue
+            else:
+                print(data)
+
+    def _knowledge_ad(self, data):
+        self._find_new_url(data['object']['url'])
+        authors = data["object"]["body"]["authors"]
+        for i in range(len(authors)):
+            authors[i].pop("icon")
+        return {
+            "type": "knowledge_ad",
+            "id": data["id"],
+            "title": data["object"]["body"]["title"],
+            "authors": authors,
+            "description": data["object"]["body"]["description"],
+            # "commodity_type": data["object"]["body"]["commodity_type"],
+            "footer": data["object"]["footer"],
+            "url": data['object']['url']
+        }
+
+    def _search_result_answer(self, data):
+        self._find_new_url("https://www.zhihu.com/question/" + data['object']['question']['url'].split('/')[-1])
+        return {
+            "id": data["object"]["id"],
+            "q_id": data["object"]["question"]["id"],
+            "type": "search_result_answer",
+            "author": data["object"]["author"],
+            "q_name": data["object"]["question"]["name"],
+            "content": data["object"]["content"],
+            "excerpt": data["object"]["excerpt"],
+            "created_time": data["object"]["created_time"],
+            "updated_time": data["object"]["updated_time"],
+            "comment_count": data["object"]["comment_count"],
+            "voteup_count": data["object"]["voteup_count"],
+            "q_url": "https://www.zhihu.com/question/" + data['object']['question']['url'].split('/')[-1]
+        }
+
+    def _wiki_box(self, data):
+        data = data['object']
+        self._find_new_url("https://www.zhihu.com/topic/" + data['url'].split('/')[-1])
+        return {
+            "id": data["id"],
+            "aliases": data['aliases'],
+            "discussion_count": data["discussion_count"],
+            "essence_feed_count": data["essence_feed_count"],
+            "excerpt": data["excerpt"],
+            "follower_count": data["follower_count"],
+            "followers_count": data["followers_count"],
+            "introduction": data["introduction"],
+            "questions_count": data["questions_count"],
+            "top_answer_count": data["top_answer_count"],
+            "type": "wiki_box",
+            "url": "https://www.zhihu.com/topic/" + data['url'].split('/')[-1]
+        }
+
+    def _find_new_url(self, url):
+        if self.get_detail:
+            self.url_manager.add_url(url)
+
+    def _search_result_article(self, data):
+        return
+
+    def _search_result_question(self, data):
+        return
+
+
+def search(keyword):
+    global KWD
+    KWD = keyword
+    base_url = 'https://api.zhihu.com/search_v3'
+    html_downloader = SpyderFrame.HtmlDownloader()
+    data_saver = SpyderFrame.DataSaver(db_name='知乎', set_name=keyword)
+    html_downloader.headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
+                  "application/signed-exchange;v=b3;q=0.9",
+        "accept-encoding": "gzip, deflate, br",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+        "cache-control": "no-cache",
+        "cookie": "d_c0=\"AADWzpXyDxKPTkP_r53qvH9ipDf4dAG7XE4=|1603087870\"; "
+                  "_zap=b47b3886-7c4a-4101-9ee5-4c803bcf6cd8; _xsrf=LRWrd8I0FyQr3hxZ49tYEABlJI0MFizY; "
+                  "capsion_ticket=\"2|1:0|10:1603262862|14:capsion_ticket|44"
+                  ":N2UxNWE4YzExZWYxNDUwYWFkZjM4MjQ4MDhjNWExNjY"
+                  "=|fa44c3793ac9cf5fac96aab9dc9d8faadba2d384e00351c9c9642028ceace6ad\"; "
+                  "r_cap_id=\"YmY4MWY5YzA0OWRlNDk0Yjk2MTEyYWEzZDU5MjZmMmM=|1603262864"
+                  "|9dbd3b9caeccd1669c26ee92e5b543543a611713\"; "
+                  "cap_id=\"OGVlYjJjOTQ2YTgyNGMzZTlmODk4NDUzMzQ0ZTkyNjA=|1603262864"
+                  "|5e52e69215700dd4539d66e5a0833dd4a0c4c1fe\"; "
+                  "l_cap_id=\"ODViMjY0YmExNWNlNGVmYWJmMGY5MGUyNTUzMjQxMzM=|1603262864"
+                  "|8a107e67c1f9223cd88f066cda42b6ce2102b632\"; "
+                  "z_c0=Mi4xQnNEUkNBQUFBQUFBQU5iT2xmSVBFaGNBQUFCaEFsVk5saWQ5WUFERVEzVUJpOVdzZHRZcnloaE9OZWVXVDZwTlhR"
+                  "|1603262870|42b123d5ae8b1fb74a8815b13eae8cb34f92508c; tst=r; "
+                  "q_c1=582f701a20454c59be03f2470d62b194|1603326280000|1603326280000; "
+                  "Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1603653130,1603680022,1603682173,1603683176; "
+                  "Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1603683176; "
+                  "KLBRSID=af132c66e9ed2b57686ff5c489976b91|1603684342|1603684271",
+        "pragma": "no-cache",
+        "sec-fetch-dest": "document",
+        "sec-fetch-mode": "navigate",
+        "sec-fetch-site": "none",
+        "sec-fetch-user": "?1",
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+                      "Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51 "
+    }
+    params = {
+        "advert_count": "0",
+        "correction": "1",
+        "lc_idx": "0",
+        "limit": "20",
+        "offset": "0",
+        "q": keyword,
+        "search_hash_id": "1e3c9a021028e71019c7977637948651",
+        "show_all_topics": "0",
+        "t": "general",
+        "vertical_info": "0,1,0,0,0,0,0,0,0,2"
+    }
+    html_parser = HTMLParser(get_detail=0)
+    res = html_downloader.download(url=base_url, params=params)
+    while True:
+        res = json.loads(res)
+        for data in html_parser.parser(res['data']):
+            data_saver.mongo_insert(data)
+        if res['paging']['is_end']:
+            break
+        next_url = res['paging']['next']
+        res = html_downloader.download(next_url)
+    # stop the proxy thread before exiting
+    html_downloader.proxies.__exit__()
+
+
+if __name__ == '__main__':
+    kwd = input("Enter a search keyword: ")
+    search(kwd)
diff --git a/utils/question.py b/utils/question.py
new file mode 100644
index 0000000..6d982db
--- /dev/null
+++ b/utils/question.py
@@ -0,0 +1,70 @@
+"""
+    @author 满目皆星河
+    @create_date 2020/10/06
+    @update_date 2020/10/06
+    @desc Zhihu answer spider: crawls every answer under a given question, with full answer details,
+          and stores them in MongoDB 知乎.questions
+    @main_function spyder(question_id: str)
+"""
+
+import json
+from frame import SpyderFrame
+
+URL_MANAGER = SpyderFrame.UrlManager()
+
+
+class HtmlParser(SpyderFrame.HtmlParser):
+
+    @staticmethod
+    def question_json_parser(question_text: str):
+        # parse the JSON string into a dict
+        question_json = json.loads(question_text)
+
+        # cursor: queue the URL of the next page, if any
+        if not question_json["paging"]["is_end"]:
+            next_url = question_json["paging"]["next"]
+            URL_MANAGER.add_url(next_url)
+
+        # unpack the records in the "data" array
+        data_results = question_json["data"]
+        for i in range(len(data_results)):
+            # rename "id" to "_id", MongoDB's primary-key field
+            data_results[i].update({"_id": data_results[i].pop("id")})
+            yield data_results[i]
+
+
+def spyder(question_id: str):
+    """Crawl every answer under one question.
+
+    :param question_id: str, the numeric question ID
+    """
+    base_url_start = "https://www.zhihu.com/api/v4/questions/"
+    base_url_end = "/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed" \
+                   "%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by" \
+                   "%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count" \
+                   "%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info" \
+                   "%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting" \
+                   "%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B" \
+                   "%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics" \
+                   "&limit=5&offset=0"
+    html_parser = HtmlParser()
+    html_downloader = SpyderFrame.HtmlDownloader()
+    data_saver = SpyderFrame.DataSaver(db_name="知乎", set_name="questions")
+
+    # seed the URL queue
+    URL_MANAGER.add_url(url=base_url_start + question_id + base_url_end)
+
+    while URL_MANAGER.not_complete():
+        url = URL_MANAGER.get()
+        for data in html_parser.question_json_parser(html_downloader.download(url)):
+            data_saver.mongo_insert(data_dict=data)
+
+    # stop the proxy thread
+    html_downloader.proxies.__exit__()
+
+
+if __name__ == '__main__':
+    question_list = []
+    for question in input("Enter question IDs, comma-separated: ").split(","):
+        question_list.append(question.strip())
+
+    for question in question_list:
+        spyder(question)
diff --git a/utils/user.py b/utils/user.py
new file mode 100644
index 0000000..0d69c36
--- /dev/null
+++ b/utils/user.py
@@ -0,0 +1,17 @@
+from frame import SpyderFrame
+
+
+class JsonParser(SpyderFrame.HtmlParser):
+    def __init__(self):
+        super().__init__()
+
+    # def user_answered(self):
+
+
+if __name__ == '__main__':
+    USER_BASE_URL = "https://www.zhihu.com/people/"
+    JSON_PARSER = JsonParser()
+    URL_MANAGER = SpyderFrame.UrlManager("zhihu")
+    URL_MANAGER.add_url(USER_BASE_URL + "dear-w-34")
+    HTML_DOWNLOADER = SpyderFrame.HtmlDownloader()
+    DATA_SAVER = SpyderFrame.DataSaver("知乎", "user")
diff --git a/utils/wiki_box.py b/utils/wiki_box.py
new file mode 100644
index 0000000..0b04d81
--- /dev/null
+++ b/utils/wiki_box.py
@@ -0,0 +1,16 @@
+"""
+    @author 满目皆星河
+    @create_date 2020/10/21
+    @update_date 2020/10/21
+    @desc Zhihu wiki data: some questions are folded into a Zhihu wiki entry. Only the wiki entry itself
+          is kept here; full question details are handled elsewhere (utils/question.py)
+    @main_function spyder(question_id: str)
+"""
+
+
+def parser(text: str):
+    """Parse the wiki_box content.
+
+    :param text: raw HTML
+    :return data: the parsed wiki record
+    """
+    raise NotImplementedError  # body not yet written in this patch