diff --git a/config.py b/config.py
index 269ffb0..0dbe041 100644
--- a/config.py
+++ b/config.py
@@ -1,10 +1,14 @@
 # - global -
 DB_NAME = "知乎"
+
 USE_REDIS = True  # 如果设置为False,后面的*_ADD_*_ID都将失效
 REDIS_HOST = "192.168.100.101"
 REDIS_PORT = 6379
 REDIS_PASSWORD = None
+
 MONGO_CONNECTION = "mongodb://localhost:27017/"
+MONGO_DOC_LIMIT = 5000  # 每个文档最多存储5000条data
+
 PROXIES_LIVE_TIME = 60
 REQUEST_RETRY_TIMES = 5
 SOCKET_DEFAULT_TIMEOUT = 20  # 整个request生命周期
diff --git a/frame/SpiderFrame.py b/frame/SpiderFrame.py
index 5109d7f..76ebd2d 100644
--- a/frame/SpiderFrame.py
+++ b/frame/SpiderFrame.py
@@ -6,19 +6,29 @@
 v0.3 加入Redis支持,UrlManager使用Redis运行大型项目可以断点续爬,DataSaver使用Redis解决硬盘I/O低影响爬虫速度
 """
 import threading
+import random
+
 import pandas as pd
 import requests
 import redis
 import socket
 import logging
 import time
+
 import config
 from os import path
 
 redis = redis.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT, password=config.REDIS_PASSWORD)
 
 
-class exception(Exception):
+class exception:
+    class RequestRetryError(Exception):
+        def __init__(self):
+            super().__init__()
+
+        def __str__(self):
+            return "发起过多次失败的Requests请求"
+
     class UserNotExist(Exception):
         def __init__(self):
             super().__init__()
@@ -84,12 +94,12 @@ def __init__(self):
         super().__init__()
         # 线程运行标志
         self.__thread__flag = True
-        self.get_proxies_api = "http://api.xdaili.cn/xdaili-api/greatRecharge/getGreatIp?spiderId" \
-                               "=192b9425f13c47ffbbe4a663c974608b&orderno=YZ2020219595449Wzor&returnType=2&count=1 "
+        self.get_proxies_api = "http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=0&city=0&yys=0&port=11&pack=125417&ts=1&ys=0&cs=0&lb=1&sb=0&pb=45&mr=2&regions=110000,130000,140000,310000,320000,330000,340000,350000,360000,370000,410000,420000,430000,440000,500000,510000,610000"
         self.Proxies = {
             "http": "",
             "https": ""
         }
+        self.live_time = config.PROXIES_LIVE_TIME
 
     # 结束线程
     def __exit__(self):
@@ -101,23 +111,24 @@ def get_proxies(self):
         i = 0
         for i in range(config.REQUEST_RETRY_TIMES):
             res = requests.get(self.get_proxies_api)
-            j = eval(res.text)
-            if j['ERRORCODE'] == '0':
-                self.Proxies['http'] = "http://" + j['RESULT'][0]['ip'] + ":" + j['RESULT'][0]['port']
-                self.Proxies['https'] = "http://" + j['RESULT'][0]['ip'] + ":" + j['RESULT'][0]['port']
+            j = eval(res.text.replace("true", "True").replace("false", "False").replace("null", "'null'"))
+            if j['code'] == 0:
+                self.Proxies['http'] = "http://" + j['data'][0]['ip'] + ":" + str(j['data'][0]['port'])
+                self.Proxies['https'] = "https://" + j['data'][0]['ip'] + ":" + str(j['data'][0]['port'])
+                self.live_time = int(time.mktime(time.strptime(j["data"][0]["expire_time"], "%Y-%m-%d %H:%M:%S"))) - time.time()
                 logger.info("Successfully get proxies")
                 return
             logger.warning("Failed, " + str(i + 1) + " times get proxies...")
-            time.sleep(5)
+            time.sleep(random.randrange(7, 13))
         if i == 4:
             logger.critical("Get proxies failed, exit program...")
 
     # 监测代理时间。如果超时更新代理
     def run(self) -> None:
         start_time = time.time()
+        self.get_proxies()
         while self.__thread__flag:
-            # 设置代理生存时间为60s
-            if start_time - time.time() > config.PROXIES_LIVE_TIME:
+            if time.time() - start_time > self.live_time:
                 logger.warning("proxies failure, get new one")
                 # 重设代理使用时长
                 start_time = time.time()
@@ -204,7 +215,6 @@ def __init__(self):
     def download(self, url: str, params=None) -> str:
         if url == "":
             raise exception.UrlEmptyException
-        res = ''  # 没啥用,消除警告而已
         if params is None:
             params = {}
         for i in range(config.REQUEST_RETRY_TIMES):
@@ -213,20 +223,25 @@
                 res = requests.get(url, headers=config.HEADERS, proxies=self.proxies.Proxies, params=params, timeout=3)
                 if res.status_code == 200:
                     return res.text
-                # 非200,更换代理,抛出异常
-                self.proxies.get_proxies()
                 res.raise_for_status()
             # 记录异常
             except requests.exceptions.HTTPError:
-                logger.error(u"HTTPError; Code {0}[{1}]".format(str(res.status_code), url))
+                logger.warning(
+                    "HTTPError with url:<{0}> retrying.....{1},{2}".format(url[:25] + " ... " + url[-15:], i + 1,
+                                                                           config.REQUEST_RETRY_TIMES))
             except requests.exceptions.Timeout:
-                logger.error(url + "; Timeout")
+                logger.warning(
+                    "Timeout with url:<{0}> retrying.....{1},{2}".format(url[:25] + " ... " + url[-15:], i + 1,
+                                                                         config.REQUEST_RETRY_TIMES))
+            except requests.exceptions.ProxyError:
+                self.proxies.get_proxies()
+                logger.error("Cannot connect to proxy.', timeout('timed out')", exc_info=True)
+                time.sleep(10)
             except Exception:
-                logger.error("Undefined Error [{0}]".format(url))
-                # 请求失败,更换代理,重试
-                self.proxies.get_proxies()
-                logger.warning("downloading error , retrying.....{0},3".format(i + 1))
+                logger.error("Undefined Error [{0}]".format(url), exc_info=True)
+            self.proxies.get_proxies()
         logger.critical("requests.exceptions.RetryError [{0}]".format(url), exc_info=True)
+        time.sleep(10)
         raise requests.exceptions.RetryError
 
     def img_download(self, dir_path: str, url: str) -> None:
diff --git a/main.py b/main.py
index 7d4df9e..1cf9fa2 100644
--- a/main.py
+++ b/main.py
@@ -40,7 +40,7 @@ def run(self):
         except Exception as e:
             # 之前的报错信息已被记录
             logger.critical("Unexpected Exit TopicSpider: {0}, Message: {1}".format(_id, e), exc_info=True)
-            send_mail("TopicSpider发生意料之外的错误,已退出线程")
+            # send_mail("TopicSpider发生意料之外的错误,已退出线程")
         finally:
             logger.warning("TopicSpider finished with exit code 0")
             topic.html_downloader.proxies.__exit__()
@@ -50,14 +50,20 @@ class QuestionSpider(Thread):
     def __init__(self):
         logger.info("QuestionSpider init...")
         super().__init__()
+
+        self.flag = True
         # url_manager方法已经内置,只需要使用id_manager传入ID参数即可
         self.id_manager = SpiderFrame.UrlManager(db_set_name=config.QUESTION_ID_SET, use_redis=config.USE_REDIS)
 
+    def __exit__(self):
+        logger.warning("强制终止线程: QuestionSpider")
+        self.flag = False
+
     def run(self):
         logger.info("QuestionSpider thread start...")
         _id = ''
         try:
-            while True:
+            while self.flag:
                 if self.id_manager.list_not_null():
                     _id = self.id_manager.get()
                     question.spider(_id)
@@ -66,7 +72,7 @@ def run(self):
         except Exception as e:
             # 之前的报错信息已被记录
             logger.critical("Unexpected Exit QuestionSpider: {0}, Message: {1}".format(_id, e), exc_info=True)
-            send_mail("QuestionSpider发生意料之外的错误,已退出线程")
+            # send_mail("QuestionSpider发生意料之外的错误,已退出线程")
         finally:
             logger.warning("QuestionSpider finished with exit code 0")
             question.html_downloader.proxies.__exit__()
@@ -76,9 +82,15 @@ class CommentSpider(Thread):
     def __init__(self):
         logger.info("CommentSpider init...")
         super().__init__()
+
+        self.flag = True
         # url_manager方法已经内置,只需要使用id_manager传入ID参数即可
         self.id_manager = SpiderFrame.UrlManager(db_set_name=config.ANSWER_ID_SET, use_redis=config.USE_REDIS)
 
+    def __exit__(self):
+        logger.warning("强制终止线程: CommentSpider")
+        self.flag = False
+
     def run(self):
         _id = ''
         try:
@@ -92,7 +104,7 @@ def run(self):
         except Exception as e:
             # 之前的报错信息已被记录
             logger.critical("Unexpected Exit CommentSpider: {0}, Message: {1}".format(_id, e), exc_info=True)
-            send_mail("CommentSpider发生意料之外的错误,已退出线程")
+            # send_mail("CommentSpider发生意料之外的错误,已退出线程")
         finally:
             logger.warning("CommentSpider finished with exit code 0")
             comment.html_downloader.proxies.__exit__()
@@ -102,9 +114,15 @@ class UserSpider(Thread):
     def __init__(self):
         logger.info("UserSpider init...")
         super().__init__()
+
+        self.flag = True
         # url_manager方法已经内置,只需要使用id_manager传入ID参数即可
         self.id_manager = SpiderFrame.UrlManager(db_set_name=config.USER_ID_SET, use_redis=config.USE_REDIS)
 
+    def __exit__(self):
+        logger.warning("强制终止线程: UserSpider")
+        self.flag = False
+
     def run(self):
         logger.info("UserSpider thread start...")
         _id = ''
@@ -118,12 +136,13 @@ def run(self):
         except Exception as e:
             # 之前的报错信息已被记录
             logger.critical("Unexpected Exit UserSpider: {0}, Message: {1}".format(_id, e), exc_info=True)
-            send_mail("UserSpider发生意料之外的错误,已退出线程")
+            # send_mail("UserSpider发生意料之外的错误,已退出线程")
         finally:
             logger.warning("UserSpider finished with exit code 0")
             user.html_downloader.proxies.__exit__()
 
 
+
 if __name__ == '__main__':
     TS = TopicSpider()
     QS = QuestionSpider()
@@ -131,21 +150,21 @@ def run(self):
     US = UserSpider()
 
     TS.start()
-    logger.info("Next thread will be start after 10s")
-    sleep(10)
+    logger.info("Next thread will be start after 7.5s")
+    sleep(7.5)
     QS.start()
-    logger.info("Next thread will be start after 10s")
-    sleep(10)
+    logger.info("Next thread will be start after 7.5s")
+    sleep(7.5)
     CS.start()
-    logger.info("Next thread will be start after 10s")
-    sleep(10)
+    logger.info("Next thread will be start after 7.5s")
+    sleep(7.5)
     US.start()
 
     logger.warning("爬虫进程启动完成,启动监控进程")
     # watching
+    TS_i = QS_i = CS_i = US_i = 1
     while True:
-        TS_i = QS_i = CS_i = US_i = 1
-        if not TS.is_alive() and (redis.llen(config.TOPIC_ID_SET) or redis.llen(config.TOPIC_SET)):
+        if TS_i != 3 and not TS.is_alive() and (redis.llen("list_"+config.TOPIC_ID_SET) or redis.llen("list_"+config.TOPIC_SET)):
             for i in range(1, 4):
                 if TS.is_alive():
                     continue
@@ -156,7 +175,7 @@ def run(self):
             if i == 3 and not TS.is_alive():
                 logger.error("Active thread TS failed")
                 send_mail("TS is exit and try to activate it failed")
-        if not QS.is_alive() and (redis.llen(config.QUESTION_ID_SET) or redis.llen(config.QUESTION_SET)):
+        if QS_i != 3 and not QS.is_alive() and (redis.llen("list_"+config.QUESTION_ID_SET) or redis.llen("list_"+config.QUESTION_SET)):
             for i in range(1, 4):
                 if QS.is_alive():
                     QS_i = 1
@@ -168,7 +187,7 @@ def run(self):
             if i == 3 and not QS.is_alive():
                 logger.error("----- Active thread QS failed -----")
                 send_mail("QS is exit and try to activate it failed")
-        if not CS.is_alive() and (redis.llen(config.ANSWER_ID_SET) or redis.llen(config.COMMENT_SET)):
+        if CS_i != 3 and not CS.is_alive() and (redis.llen("list_"+config.ANSWER_ID_SET) or redis.llen("list_"+config.COMMENT_SET)):
             for i in range(1, 4):
                 if CS.is_alive():
                     CS_i = 1
@@ -180,7 +199,7 @@ def run(self):
             if i == 3 and not CS.is_alive():
                 logger.error("----- Active thread CS failed -----")
                 send_mail("CS is exit and try to activate it failed")
-        if not US.is_alive() and redis.llen(config.USER_ID_SET):
+        if US_i != 3 and not US.is_alive() and redis.llen("list_"+config.USER_ID_SET):
             for i in range(1, 4):
                 if US.is_alive():
                     US_i = 1
diff --git a/tools/KeyWordsSearch.py b/tools/KeyWordsSearch.py
index 2190300..e13dc9b 100644
--- a/tools/KeyWordsSearch.py
+++ b/tools/KeyWordsSearch.py
@@ -89,14 +89,14 @@ def search(keyword):
     if not data_saver.mg_data_db.find_one({"KeyWord": KWD}):
         data_saver.mongo_insert({
             "KeyWord": KWD,
-            "result": []
+            "data": []
         })
 
     while True:
         res = json.loads(res)
         logger.info("Saving Data To MongoDB")
         for data in res['data']:
-            data_saver.mg_data_db.update_one({"KeyWord": KWD}, {'$addToSet': {"result": data}})
+            data_saver.mg_data_db.update_one({"KeyWord": KWD}, {'$addToSet': {"data": data}})
             html_parser.parse(data)
         if res['paging']['is_end']:
             logger.info("Paging is end, exit")
diff --git a/utils/comment.py b/utils/comment.py
index 8190b4b..648253b 100644
--- a/utils/comment.py
+++ b/utils/comment.py
@@ -21,6 +21,7 @@
 
 def spider(answer_id: str) -> None:
     # 增量爬取评论
+    offset = config.MONGO_DOC_LIMIT
     logger.info("Get comments for answer id: {0}".format(answer_id))
     url = "https://www.zhihu.com/api/v4/answers/{}/root_comments?limit=10&offset=0&order=normal&status=open" \
         .format(answer_id)
@@ -33,26 +34,49 @@ def spider(answer_id: str) -> None:
         data_saver.mongo_insert({
             "AnswerID": answer_id,
             "common_counts": res['paging']['totals'],
-            "result": []
+            "limit": config.MONGO_DOC_LIMIT,
+            "offset": offset,
+            "end_url": "",
+            "data": []
         })
 
     try:
         while url_manager.list_not_null():
             sleep(.3)
-            res = html_downloader.download(url_manager.get())
+            url = url_manager.get()
+            try:
+                res = html_downloader.download(url)
+            except SpiderFrame.exception.RequestRetryError as e:
+                logger.error(e, exc_info=True)
+                url_manager.add_url(url)
+                sleep(1)
+                continue
             res = json_lds(res)
             for data in res['data']:
-                if data_saver.mg_data_db.find_one({"result.id": data["id"]}):
+                if len(data_saver.mg_data_db.find_one({"AnswerID": answer_id, "offset": offset})["data"]) >= 5000:
+                    logger.warning("MongoDB document out of limit, Create new document and update offset")
+                    offset += config.MONGO_DOC_LIMIT
+                    data_saver.mongo_insert({
+                        "AnswerID": answer_id,
+                        "common_counts": res['paging']['totals'],
+                        "limit": config.MONGO_DOC_LIMIT,
+                        "offset": offset,
+                        "end_url": "",
+                        "data": []
+                    })
+                if data_saver.mg_data_db.find_one({"data.id": data["id"]}):
                     # 已经存在的,不存储
                     continue
-                data_saver.mg_data_db.update_one({"AnswerID": answer_id}, {'$addToSet': {"result": data}})
+                data_saver.mg_data_db.update_one({"AnswerID": answer_id, "offset": offset}, {'$addToSet': {"data": data}})
                 try:
                     if data["author"]["url_token"] is not "":
                         url_manager.add_id(id_set=config.USER_ID_SET, _id=data["author"]["member"]["url_token"])
                 except:
                     pass
+
             if res['paging']['is_end']:
                 logger.info("Paging is end, exit")
+                data_saver.mg_data_db.update_one({"AnswerID": answer_id, "offset": offset}, {"$set": {"end_url": url}})
                 break
             url = res['paging']['next']
             url_manager.add_url(url)
diff --git a/utils/question.py b/utils/question.py
index 305b456..958f31c 100644
--- a/utils/question.py
+++ b/utils/question.py
@@ -74,7 +74,7 @@ def spider(question_id: str):
     :param question_id: 问题ID
     """
     url = ""
-
+    offset = config.MONGO_DOC_LIMIT
     try:
         # 初始化URL队列,如果之前已经爬过,添加不进去,继续上次断点
        html_parser.url_manager.add_url(url=_init_url_(question_id))
@@ -84,7 +84,7 @@ def spider(question_id: str):
         res = html_downloader.download(url)
 
         title, question, tag_list, follower, watched = html_parser.parse_base_question_info(res)
-        if not data_saver.mg_data_db.find_one({"QuestionId": question_id}):
+        if not data_saver.mg_data_db.find_one({"QuestionId": question_id, "offset": offset}):
             data_saver.mongo_insert({
                 "QuestionId": question_id,
                 "title": title,
@@ -92,20 +92,47 @@ def spider(question_id: str):
                 "tag_list": tag_list,
                 "follower": follower,
                 "watched": watched,
+                "limit": config.MONGO_DOC_LIMIT,
+                "offset": offset,
                 "end_url": "",
-                "result": []
+                "data": []
             })
 
         # question detail
         while html_parser.url_manager.list_not_null():
             sleep(.3)
             url = html_parser.url_manager.get()
-            res = html_downloader.download(url)
-            question_json = json.loads(res)
+            try:
+                res = html_downloader.download(url)
+            except SpiderFrame.exception.RequestRetryError as e:
+                logger.error(e, exc_info=True)
+                html_parser.url_manager.add_url(url)
+                sleep(1)
+                continue
+            try:
+                question_json = json.loads(res)
+            except:
+                logger.error("Json格式校验错误")
+                continue
             for data in question_json["data"]:
+                if len(data_saver.mg_data_db.find_one({"QuestionId": question_id, "offset": offset})["data"]) >= 5000:
+                    logger.warning("MongoDB document out of limit, Create new document and update offset")
+                    offset += config.MONGO_DOC_LIMIT
+                    data_saver.mongo_insert({
+                        "QuestionId": question_id,
+                        "title": title,
+                        "question": question,
+                        "tag_list": tag_list,
+                        "follower": follower,
+                        "watched": watched,
+                        "limit": config.MONGO_DOC_LIMIT,
+                        "offset": offset,
+                        "end_url": "",
+                        "data": []
+                    })
                 if config.QUESTION_ADD_ANSWER_ID:
                     html_parser.url_manager.add_id(config.ANSWER_ID_SET, data["id"])
-                if data_saver.mg_data_db.find_one({"result.id": data["id"]}) and data_saver.mg_data_db.find_one({"result.updated_time": data['updated_time']}):
+                if data_saver.mg_data_db.find_one({"data.id": data["id"]}) and data_saver.mg_data_db.find_one({"data.updated_time": data['updated_time']}):
                     continue
                 try:
                     data.pop("excerpt")
@@ -119,7 +146,7 @@ def spider(question_id: str):
                         html_downloader.img_download(img_path, img_url)
                 except:
                     logger.error("Answer:{0}, Message: 图片下载失败".format(data["id"]))
-                data_saver.mg_data_db.update_one({"QuestionId": question_id}, {'$addToSet': {"result": data}})
+                data_saver.mg_data_db.update_one({"QuestionId": question_id, "offset": offset}, {'$addToSet': {"data": data}})
                 try:
                     if data["author"]["url_token"] is not "":
                         html_parser.url_manager.add_id(id_set=config.USER_ID_SET, _id=data["author"]["url_token"])
@@ -129,7 +156,7 @@ def spider(question_id: str):
             # 游标,下个数据包URL
             if question_json["paging"]["is_end"]:
                 logger.info("Question id {0} is complete! ".format(question_id))
-                data_saver.mg_data_db.update_one({"QuestionId": question_id}, {"$set": {"end_url": url}})
+                data_saver.mg_data_db.update_one({"QuestionId": question_id, "offset": offset}, {"$set": {"end_url": url}})
                 break
 
             next_url = question_json["paging"]["next"]
diff --git a/utils/topic.py b/utils/topic.py
index 33b0751..c7ad7ed 100644
--- a/utils/topic.py
+++ b/utils/topic.py
@@ -9,6 +9,7 @@
 from bs4 import BeautifulSoup
 from time import sleep
 
+import pymongo.errors
 import config
 import json
 
@@ -35,6 +36,7 @@ def parse_base_topic_info(html: str):
 
 
 def spider(topic_id: str):
+    offset = config.MONGO_DOC_LIMIT
     url = "https://www.zhihu.com/api/v4/topics/{0}/feeds/top_question?include=data%5B%3F(target.type%3Dtopic_sticky_module)%5D.target.data%5B%3F(target.type%3Danswer)%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F(target.type%3Dtopic_sticky_module)%5D.target.data%5B%3F(target.type%3Danswer)%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B%3F(target.type%3Dtopic_sticky_module)%5D.target.data%5B%3F(target.type%3Darticle)%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B%3F(target.type%3Dtopic_sticky_module)%5D.target.data%5B%3F(target.type%3Dpeople)%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B%3F(target.type%3Danswer)%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Canswer_type%3Bdata%5B%3F(target.type%3Danswer)%5D.target.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B%3F(target.type%3Danswer)%5D.target.paid_info%3Bdata%5B%3F(target.type%3Darticle)%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B%3F(target.type%3Dquestion)%5D.target.annotation_detail%2Ccomment_count%3B&offset=5&limit=20".format(
         topic_id)
     url_manager.add_url(url)
@@ -43,8 +45,7 @@ def spider(topic_id: str):
         base_url = "https://www.zhihu.com/topic/{0}/hot".format(topic_id)
         res = html_downloader.download(base_url)
         title, follower, question_num = parse_base_topic_info(res)
-
-        if not data_saver.mg_data_db.find_one({"TopicId": topic_id}):
+        if not data_saver.mg_data_db.find_one({"TopicId": topic_id, "offset": offset}):
             data_saver.mongo_insert({
                 "TopicId": topic_id,
                 "title": title,
@@ -52,22 +53,46 @@ def spider(topic_id: str):
                 # "child_tag_list": child_tag_list,
                 "follower": follower,
                 "question_num": question_num,
+                "limit": config.MONGO_DOC_LIMIT,
+                "offset": offset,
                 "end_url": "",
-                "result": [],
+                "data": [],
             })
 
         while url_manager.list_not_null():
             sleep(.3)
             url = url_manager.get()
-            res = html_downloader.download(url)
+            try:
+                res = html_downloader.download(url)
+            except SpiderFrame.exception.RequestRetryError as e:
+                logger.error(e, exc_info=True)
+                url_manager.add_url(url)
+                sleep(1)
+                continue
             topic_json = json.loads(res)
             for data in topic_json["data"]:
+                if len(data_saver.mg_data_db.find_one({"TopicId": topic_id, "offset": offset})["data"]) >= 5000:
+                    logger.warning("MongoDB document out of limit, Create new document and update offset")
+                    offset += config.MONGO_DOC_LIMIT
+                    data_saver.mongo_insert({
+                        "TopicId": topic_id,
+                        "title": title,
+                        # "father_tag_list": father_tag_list,
+                        # "child_tag_list": child_tag_list,
+                        "follower": follower,
+                        "question_num": question_num,
+                        "limit": config.MONGO_DOC_LIMIT,
+                        "offset": offset,
+                        "end_url": "",
+                        "data": [],
+                    })
                 if config.TOPIC_ADD_QUESTIONS_ID:
                     url_manager.add_id(config.QUESTION_ID_SET, data["target"]["id"])
-                if data_saver.mg_data_db.find_one({"result.id": data["target"]["id"]}):
+                # 是否存在于数据库
+                if data_saver.mg_data_db.find_one({"data.id": data["target"]["id"]}):
                     continue
-                data_saver.mg_data_db.update_one({"TopicId": topic_id}, {'$addToSet': {"result": data["target"]}})
+                data_saver.mg_data_db.update_one({"TopicId": topic_id, "offset": offset}, {'$addToSet': {"data": data["target"]}})
                 try:
                     if data["author"]["url_token"] is not "":
                         url_manager.add_id(id_set=config.USER_ID_SET, _id=data["author"]["url_token"])
@@ -77,13 +102,14 @@ def spider(topic_id: str):
             # 游标,下个数据包URL
             if topic_json["paging"]["is_end"]:
                 logger.warning("Topic id {0} is complete! ".format(topic_id))
-                data_saver.mg_data_db.update_one({"TopicId": topic_id}, {"$set": {"end_url": url}})
+                data_saver.mg_data_db.update_one({"TopicId": topic_id, "offset": offset}, {"$set": {"end_url": url}})
                 break
 
             next_url = topic_json["paging"]["next"]
             logger.info("New url has been find: <{0} ... {1}>".format(url[:21], url[-11:]))
             url_manager.add_url(next_url)
-
+    except pymongo.errors.WriteError as e:
+        logger.warning("MongoDB document out of limit, Create new document and update offset")
     except Exception as e:
         logger.critical("Fatal Error, Message:{0}, With url: <{0}>, Saving data and exit".format(e, url), exc_info=True)
         url_manager.force_add_url(url)
@@ -98,7 +124,7 @@ def spider(topic_id: str):
 
 if __name__ == '__main__':
     logger.info("Run as main")
-    spider("19610306")
+    spider("19564758")
     # 结束线程
     logger.info("Kill Proxies")
     html_downloader.proxies.__exit__()
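
Note on the schema change above: every spider now writes offset-sharded MongoDB documents, one document per (ID, offset) pair holding at most MONGO_DOC_LIMIT (5000) entries in its `data` array, with the paging cursor saved in `end_url`. Below is a minimal read-back sketch with pymongo; only the field names (`QuestionId`, `offset`, `limit`, `data`) come from the diff, while the database and collection names and the question id are hypothetical placeholders (the project itself wraps the real handle in `DataSaver.mg_data_db`).

```python
# Hypothetical read-back sketch for the offset-sharded layout (not part of the diff).
# Field names match the diff; db/collection names and the question id are placeholders.
import pymongo
import config  # reuses MONGO_CONNECTION from the diff's config.py

client = pymongo.MongoClient(config.MONGO_CONNECTION)
collection = client["知乎"]["question_data"]  # hypothetical collection name


def iter_answers(question_id: str):
    """Yield every stored answer for a question, walking its shard documents in offset order."""
    cursor = collection.find({"QuestionId": question_id}).sort("offset", pymongo.ASCENDING)
    for doc in cursor:
        # Each shard document holds at most doc["limit"] (= MONGO_DOC_LIMIT) entries in "data".
        for answer in doc.get("data", []):
            yield answer


if __name__ == "__main__":
    print(sum(1 for _ in iter_answers("12345678")))  # placeholder question id
```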