main.py
"""
Demo crawler script.

Scrapes the PDFs from an MIT course website. The original goal was to grab
every PDF under a folder on a campus-intranet service, but the site took all
of its PDFs down, so the target had to change.

Uses the DynaConf package for global configuration.

author: hessen
"""
import os

import requests
from bs4 import BeautifulSoup

from config import settings

def downloadFile(download_links, download_dir):
    """
    Download each link to download_dir, streaming in 1 KiB chunks.
    :param download_links: list of direct PDF URLs
    :param download_dir: local directory to save the files into
    :return: None
    """
    print("====={:^20}=====".format("Started Downloading"))
    for link in download_links:
        # Derive the local file name from the last URL segment
        save_position = os.path.join(download_dir, link.split('/')[-1])
        r = requests.get(link, stream=True)
        with open(save_position, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
        print("{} saved to {}".format(link, save_position))
    print("====={:^20}=====".format("Finished Downloading"))

def analyzeLink_PDF(root_link, link_start):
    """ TODO: template this func
    Fetch root_link and parse out the addresses of all linked PDFs.
    :param root_link: the page listing the PDFs
    :param link_start: scheme-and-host prefix for relative links [https://xxxx]
    :return: list of all target links
    """
    # collects all target links
    targetLinks = []
    # Spoof browser headers so the server treats the request as a normal browser
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cookie": "PHPSESSID=0dc480ae4ac40257fd4b9ec09c517d67"}
    print("====={:^20}=====".format("Analyzing links"))
    # The headers must go in the `headers` keyword; passed positionally they
    # would be sent as query params instead.
    r = requests.get(root_link, headers=send_headers)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, "html5lib")
        a_tags = soup.find_all("a")
        for tag in a_tags:
            href_content = tag.get('href')
            # Skip <a> tags without an href, and anything that is not a PDF
            if href_content is None or not href_content.endswith('.pdf'):
                continue
            if href_content.startswith('http'):
                targetLinks.append(href_content)
            else:
                targetLinks.append(link_start + href_content)
    else:
        print("Error, please check root_link?")
    print("Total links {:<5d}".format(len(targetLinks)))
    return targetLinks

if __name__ == "__main__":
    # load config
    root_link = settings.root_link
    link_start = settings.link_start
    local_download_dir = settings.download_local_dir
    # create the download dir if it does not exist yet
    if not os.path.isdir(local_download_dir):
        os.mkdir(local_download_dir)
    # analyze, then download
    dLinks = analyzeLink_PDF(root_link, link_start)
    downloadFile(dLinks, local_download_dir)
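
# ---------------------------------------------------------------------------
# A minimal sketch of the DynaConf plumbing this script assumes: a sibling
# config.py that exposes a `settings` object loaded from a settings file.
# The file name and the example values below are assumptions for
# illustration, not taken from the repository.
#
# config.py:
#     from dynaconf import Dynaconf
#     settings = Dynaconf(settings_files=["settings.toml"])
#
# settings.toml:
#     root_link = "https://ocw.mit.edu/courses/example-course/pages/lecture-notes/"
#     link_start = "https://ocw.mit.edu"
#     download_local_dir = "downloads"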