Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
hz0ne committed Apr 8, 2021
0 parents commit 2df539b
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

# Ignore dynaconf secret files
.secrets.*
10 changes: 10 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

from dynaconf import Dynaconf

# Project-wide configuration object, shared by importing `settings`
# from this module.
#   envvar_prefix   -> values can be overridden with `export DYNACONF_FOO=bar`
#   settings_files  -> loaded in order; `.secrets.toml` holds credentials
#                      and is excluded from version control via .gitignore
settings = Dynaconf(
    envvar_prefix="DYNACONF",
    settings_files=["settings.toml", ".secrets.toml"],
)
68 changes: 68 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from bs4 import BeautifulSoup
import requests
from config import settings
# English translation of the Chinese string below:
#   Demo crawler script.
#   Crawls the PDFs of the MIT course website.
#   The original goal was to grab every PDF under a folder of an
#   on-campus service, but the site took all its PDFs down, so the
#   target was changed.
#   Uses the Dynaconf package for global configuration.
# NOTE(review): this string sits AFTER the imports, so Python does not
# treat it as the module docstring (a docstring must be the file's
# first statement); it is effectively a no-op string expression.
"""
爬虫脚本demo
爬取MIT课程网站的PDF
原本目的是抓取校园网内部服务某个文件夹下的所有PDF文件,由于站方下架所有PDF,只能改变方向
使用DynaConf包调用全局配置
author: hessen
"""


def downloadFile(download_links):
    """
    Download the given PDF files.

    Currently a stub: it only prints the banner and the links that
    would be fetched, without performing any network I/O.
    TODO(review): actually save each link, presumably into
    `settings.download_local_dir` from settings.toml — confirm intent.

    :param download_links: list of absolute PDF URLs to download
    :return: None
    """
    # Fix: banner previously read "Stared Downloading" (typo).
    print("====={:^20}=====".format("Started Downloading"))
    print(download_links)
    print("====={:^20}=====".format("Over Download"))


def analyzeLink_PDF(root_link, link_start):
    """
    Fetch root_link and collect the absolute URL of every linked PDF.

    :param root_link: URL of the page that lists the PDFs
    :param link_start: URL prefix prepended to each relative href
                       (e.g. "https://xxxx/")
    :return: list of absolute PDF links; empty list when the page
             cannot be fetched (HTTP status != 200)
    """
    # Accumulates every matching absolute link.
    targetLinks = []

    # Browser-like request headers so the server treats us as a normal
    # client rather than a script.
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cookie": "PHPSESSID=0dc480ae4ac40257fd4b9ec09c517d67"}

    print("====={:^20}=====".format("Analyzing links"))

    # BUG FIX: the second positional argument of requests.get() is
    # `params`, so the original call serialized the headers into the
    # query string and the server never received them; they must be
    # passed via the `headers` keyword.
    r = requests.get(root_link, headers=send_headers)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, "html5lib")

        for tag in soup.find_all("a"):
            linkEnd = tag.get('href')
            # Guard: <a> tags without an href attribute yield None.
            if linkEnd and linkEnd.endswith('.pdf'):
                targetLinks.append(link_start + linkEnd)

    # len() reports the true count; the old manual counter started at 1
    # and over-reported by one ("Toatal" typo fixed as well).  Printing
    # and returning outside the `if` means a failed fetch yields [] —
    # previously the function fell through and returned None.
    print("Total links {:<5d}".format(len(targetLinks)))
    return targetLinks


if __name__ == "__main__":
    # Read the crawl targets from the dynaconf settings (settings.toml)
    # and hand every discovered PDF link to the downloader.
    pdf_links = analyzeLink_PDF(settings.root_link, settings.link_start)
    downloadFile(pdf_links)
3 changes: 3 additions & 0 deletions settings.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
root_link = "https://pdos.csail.mit.edu/6.824/schedule.html"
link_start = "https://pdos.csail.mit.edu/6.824/"
download_local_dir = "D:\\Lecture-PDF"

0 comments on commit 2df539b

Please sign in to comment.