-
Notifications
You must be signed in to change notification settings - Fork 257
/
Copy pathcheck_links.py
193 lines (162 loc) · 5.87 KB
/
check_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import argparse
import os
import webbrowser
from fnmatch import fnmatch
from random import shuffle
from time import sleep
from typing import Dict
from urllib.parse import urlparse
import bs4
from fake_headers import Headers
from requests import get
from tqdm.asyncio import tqdm
# Command-line interface: which directory to scan and which request mode
# (sequential by default, or one of the two faster modes) to use.
parser = argparse.ArgumentParser(description="Check links in a directory")
parser.add_argument(
    "-d",
    "--directory",
    default=os.path.dirname(os.path.realpath(__file__)),
    help="Directory to check",
)
# The two speed switches are plain boolean flags that differ only in their
# option strings and help text, so they are registered from a small table.
for _flags, _help_text in (
    (("-f", "--fast"), "Fast async mode"),
    (("-ff", "--superfast"), "Super fast mode (may get blocked by sites)"),
):
    parser.add_argument(*_flags, action="store_true", help=_help_text)
args = parser.parse_args()
# Work from the directory chosen on the command line; by default that is the
# folder containing this script.
os.chdir(args.directory)

# Walk the whole tree below the working directory and keep the full path of
# every file whose name matches *.html.
all_html_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(os.getcwd())
    for name in names
    if fnmatch(name, "*.html")
]
# Parse each HTML file and collect the href of every <a> element.
# An anchor without an href attribute contributes None to the list.
all_links = []
for html_file in all_html_files:
    # Open in binary mode and let BeautifulSoup work out the document's
    # encoding. Text mode would decode with the locale-dependent default
    # encoding, which can fail or mis-decode when that default does not match
    # the file.
    with open(html_file, "rb") as f:
        soup = bs4.BeautifulSoup(f, "html.parser")
    all_links.extend(anchor.get("href") for anchor in soup.find_all("a"))
# Remove all duplicate links and those we do not want to request.
print(f"Total number of links before processing: {len(all_links)}")
all_links = list(set(all_links))  # Removes duplicate links
# Shuffle so we don't hit the same website twice in a row and get blocked
shuffle(all_links)

# Build a new list in a single pass instead of calling remove() on the list
# being iterated: removing during iteration makes the loop skip the element
# that follows each removal, which is why one pass used to leave unwanted
# links behind.
#
# A link is kept only if it:
#   - is a string (anchors without an href yield None),
#   - starts with "http" (this also drops "#..." anchor links, empty hrefs
#     and any other non-URL values),
#   - does not point to lopp.net or one of its subdomains such as
#     blog.lopp.net,
#   - is not a tor (.onion) address.
all_links = [
    link
    for link in all_links
    if isinstance(link, str)
    and link.startswith("http")
    and "lopp.net" not in link
    and ".onion" not in link
]
print(f"Total number of links after processing: {len(all_links)}")
# Request every link and record the ones that do not answer with HTTP 200
# (or that raise while being requested) in failed_links.
failed_links = []
headers = Headers(headers=True).generate()

if args.fast or args.superfast:
    import asyncio

    import aiohttp

    # Group the links by domain so that every round of concurrent requests
    # contains at most one link per domain.
    sorted_links: Dict = {}
    for link in all_links:
        sorted_links.setdefault(urlparse(link).netloc, []).append(link)

    # One round per position in the longest per-domain list.
    length = max((len(links) for links in sorted_links.values()), default=0)

    # The coroutines are defined once, outside the round loop, instead of
    # being re-created on every iteration.
    async def get_resp(session, url):
        """Fetch *url* and record it in ``failed_links`` if it is not working.

        A link counts as failed when the response status is not 200 or when
        the request (including reading the body) raises. The URL is recorded
        at most once per call, even if both conditions occur.
        """
        try:
            async with session.get(url) as response:
                print(url)
                succeeded = response.status == 200
                await response.text()
        except Exception as e:
            print(e)
            succeeded = False
        if not succeeded:
            failed_links.append(url)

    async def download_links(links):
        """Request all *links* concurrently through one shared session."""
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(
            headers=headers, timeout=timeout
        ) as session:
            await asyncio.gather(*(get_resp(session, link) for link in links))

    for counter in range(length):
        print(f"Counter: {counter}")

        # Take the counter-th link of every domain that still has one.
        set_of_links = []
        for links in sorted_links.values():
            if counter < len(links):
                candidate = links[counter]
                # The isinstance guard keeps a non-string entry (e.g. None
                # from an anchor without href) from raising here.
                if (
                    candidate not in set_of_links
                    and isinstance(candidate, str)
                    and candidate.startswith("http")
                ):
                    set_of_links.append(candidate)

        # Run this round's requests
        asyncio.run(download_links(set_of_links))

        # Pause for 3 seconds between rounds, except in superfast mode
        if not args.superfast:
            sleep(3)
else:
    # Sequential mode: one blocking request per link, with a progress bar.
    for link in tqdm(all_links):
        try:
            r = get(link, headers=headers, timeout=10)
        except Exception as e:
            print(e)
            failed_links.append(link)
        else:
            if r.status_code != 200:
                failed_links.append(link)
# Every link that failed the automated check is opened in a browser tab so
# the user can confirm by hand whether it is really broken.
print(f"Number of failed links: {len(failed_links)}")
really_failed_links = []
for link in failed_links:
    webbrowser.open_new_tab(link)
    print(link)
    # Normalise the answer so that "N", "n " or "no" are not silently treated
    # as "the link works". Anything else, including just pressing Enter,
    # keeps the default of "yes".
    answer = input("Is this link working?[y]/n ").strip().lower()
    if answer in ("n", "no"):
        really_failed_links.append(link)
# Search all the HTML files for the confirmed-broken links and record one
# "<file> - <link>" entry for every distinct broken link found in each file.
files_with_failed_links = []
# Set for O(1) membership tests while scanning every anchor of every file.
broken = set(really_failed_links)
for html_file in all_html_files:
    # Binary mode lets BeautifulSoup detect the encoding instead of relying
    # on the locale-dependent default of text mode.
    with open(html_file, "rb") as f:
        soup = bs4.BeautifulSoup(f, "html.parser")
    # Keep scanning after the first hit: stopping there would hide every
    # further broken link in the same file from the report. `reported`
    # prevents duplicate entries when a link occurs several times in a file.
    reported = set()
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if href in broken and href not in reported:
            reported.add(href)
            files_with_failed_links.append(f"{html_file} - {href}")
# Finally, move to the parent of the current working directory and write the
# report there, one "<file> - <link>" entry per line.
os.chdir("..")
with open("broken_links.txt", "w+") as report:
    report.writelines(f"{entry}\n" for entry in files_with_failed_links)