#!/usr/bin/env python3
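"""Archive links from RSS feeds to the Internet Archive's Wayback Machine.

The script reads feed URLs from a local 'rss_urls' file (one URL per line),
downloads the feeds concurrently, and submits every link it has not seen
before to https://web.archive.org/save/. Archived links are recorded in the
SQLite database 'archive.db', and feed/API responses are cached on disk via
the local FileCache helper.
"""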
import feedparser
import time
import os
import requests
import requests.exceptions
import json
import sqlite3
from datetime import datetime
from urllib.parse import urlparse
import tldextract
import traceback
import concurrent.futures
import tzlocal
import random
from tqdm import tqdm
import hashlib
import pickle
from file_cache import FileCache

# Constants
DB_FILE = 'archive.db'
MAX_RETRIES = 3
RETRY_DELAY = 7
MAX_LINK_ARCHIVAL_TIME = 200  # seconds

# ANSI escape codes for text color
GREEN = '\033[32m'
RED = '\033[31m'
YELLOW = '\033[33m'
MAGENTA = '\033[35m'
RESET = '\033[0m'

# Initialize FileCache
cache_dir = 'file_cache'
expiry_time = 60 * 60  # 1 hour
cache = FileCache(cache_dir, expiry_time)
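
# Note: FileCache is a local helper module shipped alongside this script. Based
# on how it is used below, it is assumed to expose store(key, value),
# retrieve(key) (returning None on a miss or after expiry), and
# retrieve_etag(url) for previously seen HTTP ETags.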

def create_database():
    """Create the SQLite database and tables if they don't exist."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    # Create a table for storing RSS feed URLs
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS rss_feeds (
            id INTEGER PRIMARY KEY,
            url TEXT UNIQUE
        )
    ''')
    conn.commit()
    conn.close()

def create_archive_table():
    """Create a table for storing archived links with their registered domain ("tld")."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS archived_links (
            id INTEGER PRIMARY KEY,
            url TEXT UNIQUE,
            tld TEXT  -- registered domain of the link, e.g. example.com
        )
    ''')
    conn.commit()
    conn.close()

def timestamp():
    """Get the current timestamp in a readable format with blue color."""
    local_timezone = tzlocal.get_localzone()
    current_time = datetime.now(local_timezone).strftime("%H:%M:%S")
    return f'\033[34m{current_time}\033[0m'

def get_rss_feed_urls_from_file():
    """Get the list of RSS feed URLs from the 'rss_urls' file."""
    rss_feed_urls = []
    try:
        with open('rss_urls', 'r') as file:
            # Skip blank lines so they don't end up as empty feed URLs
            rss_feed_urls = [line.strip() for line in file if line.strip()]
    except FileNotFoundError:
        tqdm.write("RSS feed URLs file 'rss_urls' not found.")
    return rss_feed_urls
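
# The 'rss_urls' file is expected to contain one feed URL per line, for example
# (illustrative URLs, not part of the original project):
#
#   https://example.com/feed.xml
#   https://example.org/rss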

def is_link_archived(link):
    """Check the Wayback Machine availability API for an existing snapshot of a link."""
    key = f'archived_{link}'
    data = cache.retrieve(key)
    if data is not None:
        return data
    availability_url = f'https://archive.org/wayback/available?url={link}'
    try:
        # 30-second timeout so a slow availability check cannot hang a worker
        response = requests.get(availability_url, timeout=30)
        response.raise_for_status()  # Check if the request was successful
        try:
            data = response.json()
            closest_snapshot = data.get('archived_snapshots', {}).get('closest', {})
            is_available = closest_snapshot.get('available') is True
        except json.JSONDecodeError:
            tqdm.write(f'{timestamp()} {RED}[ERROR PARSING JSON RESPONSE]: {RESET} {link}')
            is_available = False
    except requests.exceptions.RequestException as e:
        tqdm.write(f"{timestamp()} {RED}[ERROR]: {RESET}{YELLOW}{e}{RESET}")
        is_available = False
    # Store the result in the cache before returning
    cache.store(key, is_available)
    return is_available
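
# For reference, the availability endpoint queried above returns JSON shaped
# roughly like the following (field values are illustrative):
#
#   {"url": "example.com",
#    "archived_snapshots": {"closest": {"available": true,
#                                       "status": "200",
#                                       "timestamp": "20230101000000",
#                                       "url": "http://web.archive.org/web/20230101000000/http://example.com"}}}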

def format_request_error(e):
    """Format a requests exception into a short, colorized multi-line message."""
    error_lines = traceback.format_exception(type(e), e, e.__traceback__)
    formatted_error = "\n".join([
        f"{RED}[REQUEST ERROR]:",
        f" {RESET}{error_lines[-2]}",
        f" {RESET}{error_lines[-1]}",
        f"{RESET}"
    ])
    return formatted_error

def archive_link(link):
    """Archive a link to the Wayback Machine with a retry mechanism."""
    # Print progress information before archiving the link
    tqdm.write(f'{timestamp()} {MAGENTA}[ARCHIVING]: {RESET}{link}')
    for attempt in range(MAX_RETRIES):
        try:
            # Extract the registered domain (stored as "tld") using tldextract
            ext = tldextract.extract(link)
            tld = f'{ext.domain}.{ext.suffix}'
            # Check if the link is already recorded in the local database
            if is_link_in_database(link):
                tqdm.write(f'{timestamp()} {YELLOW}[SKIP - ALREADY ARCHIVED LOCALLY]: {RESET}{link}')
                return True
            # Check if the link is already archived on the Wayback Machine
            if is_link_archived(link):
                tqdm.write(f'{timestamp()} {YELLOW}[SKIP - ALREADY ARCHIVED ON WAYBACK MACHINE]: {RESET}{link}')
                insert_archived_link(link, tld)
                return True
            # Save the link to the Internet Archive
            wayback_machine_url = 'https://web.archive.org/save/' + link
            response = requests.get(wayback_machine_url, timeout=60)  # Set timeout
            if response.status_code == 200:
                tqdm.write(f'{timestamp()} {GREEN}[SUCCESSFULLY ARCHIVED]: {RESET}{link}')
                insert_archived_link(link, tld)
                return True
            else:
                tqdm.write(f'{timestamp()} {RED}[ERROR ARCHIVING]: {RESET}{link} {RED}[HTTP RESPONSE CODE]:{RESET} {YELLOW}{response.status_code}{RESET}')
        except requests.exceptions.RequestException as e:
            error_message = format_request_error(e)
            retry_message = f'{timestamp()} {YELLOW}[RETRYING] (ATTEMPT {attempt + 1}/{MAX_RETRIES}): {RESET}{link}'
            # Format the error message for clarity
            formatted_error = "\n".join(["-" * 50, error_message, retry_message, "-" * 50])
            tqdm.write(formatted_error)
        except Exception as e:
            tqdm.write(f"{timestamp()} {RED}[UNEXPECTED ERROR]: {RESET}{e}")
        time.sleep(RETRY_DELAY)
    return False

def is_link_in_database(link):
    """Check if a link is already archived in the database."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cursor.execute('SELECT url FROM archived_links WHERE url = ?', (link,))
    result = cursor.fetchone()
    conn.close()
    return result is not None

def insert_archived_link(link, tld):
    """Insert an archived link into the database with its registered domain ("tld")."""
    # Strip any 'www.' prefix from the stored domain
    tld = tld.replace('www.', '')
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cursor.execute('INSERT OR IGNORE INTO archived_links (url, tld) VALUES (?, ?)', (link, tld))
    conn.commit()
    conn.close()

def download_rss_feed(rss_feed_url):
    """Download an RSS feed and return its entries."""
    # Use the URL as the key for caching
    key = f'rss_feed_{rss_feed_url}'
    # Try to retrieve the ETag for the RSS feed URL from the cache
    etag = cache.retrieve_etag(rss_feed_url)
    headers = {}
    if etag:
        headers['If-None-Match'] = etag
    # Try to retrieve the feed from the cache
    feed_entries = cache.retrieve(key)
    # If the feed is not in the cache or is expired, download it
    if feed_entries is None:
        try:
            # Fetch the RSS feed with a 30-second timeout and the ETag in the headers
            response = requests.get(rss_feed_url, timeout=30, headers=headers)
            # Handle the response based on the status code
            if response.status_code == 304:
                # The feed has not been modified, use the cached version
                feed_entries = cache.retrieve(key)
            elif response.status_code == 200:
                # Parse the feed and cache it
                feed = feedparser.parse(response.text)
                feed_entries = feed.entries
                cache.store(key, feed_entries)
            else:
                tqdm.write(f"{RED}[FAILED TO DOWNLOAD RSS FEED]:{RESET} {rss_feed_url} {RED}[HTTP RESPONSE CODE]:{RESET} {YELLOW}{response.status_code}{RESET}")
                feed_entries = []  # Set feed_entries to an empty list
        except requests.RequestException as e:
            tqdm.write(f"{RED}[ERROR DOWNLOADING RSS FEED]:{RESET} {rss_feed_url} {RED}[RESPONSE]:{RESET} {YELLOW}{e}{RESET}")
            feed_entries = []  # Set feed_entries to an empty list
        except Exception as e:
            tqdm.write(f"{RED}[UNEXPECTED ERROR]: {RESET}{e}")
    return feed_entries or []  # Ensure the function always returns a list

def download_rss_feeds():
    """Download RSS feeds concurrently and return the list of feed entries."""
    all_entries = []
    # Get the list of RSS feed URLs from the 'rss_urls' file
    rss_feed_urls = get_rss_feed_urls_from_file()
    # Shuffle the RSS feed URLs randomly
    random.shuffle(rss_feed_urls)
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(download_rss_feed, rss_url) for rss_url in rss_feed_urls]
        with tqdm(total=len(rss_feed_urls), desc="Downloading RSS Feeds", ncols=100) as pbar:
            for future in concurrent.futures.as_completed(futures):
                try:
                    entries = future.result()
                    all_entries.extend(entries)
                except Exception as e:
                    tqdm.write(f"An error occurred: {e}")
                pbar.update(1)
    return all_entries

def main():
    """Retrieve RSS feeds, collect new links, and archive them to the Wayback Machine."""
    create_database()
    create_archive_table()
    # Get the current time
    current_time = datetime.now(tzlocal.get_localzone()).strftime("%H:%M:%S")
    tqdm.write(f'Running script at {current_time}')
    # Download RSS feeds and get the list of feed entries
    all_entries = download_rss_feeds()
    # Create a list to store links that need to be archived
    links_to_archive = []
    # Loop through each entry and check whether its link is already in the database
    for entry in all_entries:
        if hasattr(entry, 'link'):
            link = entry.link
            # Only queue links that are not yet recorded locally
            if not is_link_in_database(link):
                links_to_archive.append(link)
        else:
            tqdm.write(f'{timestamp()} {RED}[ERROR]: Entry does not have a link attribute{RESET}')
    # Shuffle the list of links to be archived randomly
    random.shuffle(links_to_archive)
    # Initialize tqdm progress bar
    progress_bar = tqdm(total=len(links_to_archive), desc="ARCHIVING", position=0, leave=True)
    # Create a ThreadPoolExecutor to process the links concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Submit each link for archiving and keep the future objects
        futures = [executor.submit(archive_link, link) for link in links_to_archive]
        # Iterate through the completed futures and handle their results or exceptions
        for future in concurrent.futures.as_completed(futures):
            try:
                # future.result() re-raises any exception from archive_link
                future.result()
            except Exception as e:
                tqdm.write(f"{timestamp()} {RED}[ERROR DURING ARCHIVING]: {e}{RESET}")
            finally:
                # Update the progress bar whether the link succeeded or failed
                progress_bar.update(1)
    progress_bar.close()


if __name__ == '__main__':
    main()
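
# Example invocation (assuming the dependencies imported above are installed
# and an 'rss_urls' file exists in the working directory):
#
#   python3 rss_archive.py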