Commit 2c6c3ca

Create wayback.py

File tree

1 file changed: +146 −0 lines changed


wayback.py

@@ -0,0 +1,146 @@
import os
import re
import time
from urllib.parse import urlparse

import requests
from colorama import Fore, Style, Back
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry  # import Retry from urllib3 directly; requests.packages is a deprecated alias


def display_banner():
    """
    Display a banner for the script start.
    """
    banner = f"""
{Style.BRIGHT}{Fore.GREEN}
##########################################
#                                        #
#    {Fore.CYAN}WEB ARCHIVE DATA FETCHING SCRIPT{Fore.GREEN}    #
#                                        #
##########################################{Style.RESET_ALL}
"""
    print(banner)


def get_response_with_retries(url, retries=3, backoff_factor=0.3):
    """
    Return the response from a URL, retrying transient server errors.
    """
    session = requests.Session()
    retry = Retry(total=retries, backoff_factor=backoff_factor, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    # A timeout keeps the request from hanging indefinitely if the CDX API stalls.
    return session.get(url, timeout=60)


def fetch_web_archive_data(domain):
    """
    Fetch data from the Wayback Machine for a given domain and save it to files.
    """
    start_time = time.time()
    url = f"https://web.archive.org/cdx/search/cdx?url=*.{domain}/*&collapse=urlkey&output=text&fl=original"

    try:
        response = get_response_with_retries(url)
        response.raise_for_status()

        # Create directories for saving results
        main_dir = "web-archive"
        os.makedirs(main_dir, exist_ok=True)
        domain_dir = os.path.join(main_dir, domain)
        os.makedirs(domain_dir, exist_ok=True)

        # Save all URLs to a file
        archive_file = os.path.join(domain_dir, f"{domain}.txt")
        urls = response.text.strip().split("\n")
        total_urls = len(urls)

        with open(archive_file, "w", encoding="utf-8") as file:
            file.write("\n".join(urls))

        # Collect unique subdomains and directories while scanning the URLs
        subdomains = set()
        directories = set()

        # Define the regex pattern for file extensions to filter
        extensions_pattern = re.compile(r"\.xls$|\.xml$|\.xlsx$|\.json$|\.pdf$|\.sql$|\.doc$|\.docx$|\.pptx$|\.txt$|\.zip$|\.tar\.gz$|\.tgz$|\.bak$|\.7z$|\.rar$|\.log$|\.cache$|\.secret$|\.db$|\.backup$|\.yml$|\.gz$|\.config$|\.csv$|\.yaml$|\.md$|\.md5$|\.exe$|\.dll$|\.bin$|\.ini$|\.bat$|\.sh$|\.tar$|\.deb$|\.rpm$|\.iso$|\.img$|\.apk$|\.msi$|\.dmg$|\.tmp$|\.crt$|\.pem$|\.key$|\.pub$|\.asc$", re.IGNORECASE)

        # Process each URL in the response content
        for url in urls:
            parsed_url = urlparse(url)
            subdomain = parsed_url.netloc
            subdomains.add(subdomain)

            if parsed_url.path:
                # Strip a trailing filename or trailing slash so only the directory part remains
                directory = re.sub(r"(/[^/]*\.[^/]*$)|/$", "", parsed_url.path)
                if directory:
                    directories.add(f"{subdomain}{directory}")

            match = extensions_pattern.search(url)
            if match:
                extension = match.group().lstrip(".").lower()
                extension_file = os.path.join(domain_dir, f"{extension}.txt")

                # Append the URL to the respective extension file
                with open(extension_file, "a", encoding="utf-8") as file:
                    file.write(url + "\n")

        # Save unique subdomains
        subdomains_file = os.path.join(domain_dir, "subdomains.txt")
        with open(subdomains_file, "w", encoding="utf-8") as file:
            file.write("\n".join(sorted(subdomains)))

        # Save unique directories
        directories_file = os.path.join(domain_dir, "directory.txt")
        with open(directories_file, "w", encoding="utf-8") as file:
            file.write("\n".join(sorted(directories)))

        elapsed_time = time.time() - start_time
        print(f"{Fore.BLUE}{Style.BRIGHT}---------------------------------------{Style.RESET_ALL}")
        print(f"{Fore.RED}Domain: {domain} | Time: {elapsed_time:.2f}s{Style.RESET_ALL}")
        print(f"{Fore.CYAN}URLs: {total_urls} | Subdomains: {len(subdomains)} | Directories: {len(directories)}{Style.RESET_ALL}")

    except requests.exceptions.RequestException as e:
        print(f"{Fore.RED}Error processing {domain}: {e}{Style.RESET_ALL}")
        print(f"{Fore.BLUE}{Style.BRIGHT}---------------------------------------{Style.RESET_ALL}")


def process_input(input_data):
    """
    Process user input to handle a single domain or a file containing multiple domains.
    """
    total_start_time = time.time()

    if os.path.isfile(input_data):
        with open(input_data, "r", encoding="utf-8") as file:
            domains = [line.strip() for line in file if line.strip()]
    else:
        domains = [input_data]

    # Process each domain sequentially
    for domain in domains:
        fetch_web_archive_data(domain)

    # Add a separator line above the total processing time
    print(f"{Fore.BLUE}{Style.BRIGHT}======================================={Style.RESET_ALL}")
    total_elapsed_time = time.time() - total_start_time
    print(f"{Fore.GREEN}Total Processing Time: {total_elapsed_time:.2f} seconds{Style.RESET_ALL}")


if __name__ == "__main__":
    display_banner()
    user_input = input(f"""
{Back.BLACK}{Fore.WHITE}{Style.BRIGHT}Enter a domain or path to a .txt file containing domains: {Style.RESET_ALL}
> """).strip()
    if user_input:
        process_input(user_input)
    else:
        print(f"{Fore.RED}Invalid input. Please provide a valid domain or file path.{Style.RESET_ALL}")
