Create wayback.py

manoselva · web-flow · commit 2c6c3ca7ddda · 2025-01-18T19:55:57.000+05:30
diff --git a/wayback.py b/wayback.py
@@ -0,0 +1,146 @@
+import os
+import requests
+from urllib.parse import urlparse
+import re
+import time
+from colorama import Fore, Style, Back
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
+
+def display_banner():
+    """
+    Print the coloured start-up banner (a boxed script title) to stdout.
+
+    The box is drawn in bright green with the title in cyan; all styling is
+    reset at the end of the bottom border.
+    """
+    border = "##########################################"
+    spacer = "#                                        #"
+    title = f"#     {Fore.CYAN}WEB ARCHIVE DATA FETCHING SCRIPT{Fore.GREEN}   #"
+    rows = (
+        "",  # blank line before the box
+        f"{Style.BRIGHT}{Fore.GREEN}",
+        border,
+        spacer,
+        title,
+        spacer,
+        f"{border}{Style.RESET_ALL}",
+        "",  # newline after the box (print adds one more)
+    )
+    print("\n".join(rows))
+
+
+def get_response_with_retries(url, retries=3, backoff_factor=0.3, timeout=(10, 120)):
+    """
+    Return the response of a GET request to *url*, retrying transient failures.
+
+    Args:
+        url: Address to fetch.
+        retries: Maximum number of retries per request.
+        backoff_factor: Multiplier for the delay between successive retries.
+        timeout: ``(connect, read)`` timeout in seconds handed to
+            ``requests``; pass ``None`` to wait indefinitely.
+
+    Returns:
+        The ``requests.Response`` of the final attempt.
+
+    Raises:
+        requests.exceptions.RequestException: On connection errors, timeouts,
+            or when the retries are exhausted.
+    """
+    retry = Retry(total=retries, backoff_factor=backoff_factor, status_forcelist=[500, 502, 503, 504])
+    adapter = HTTPAdapter(max_retries=retry)
+    # The context manager closes the pooled connections once the body has been
+    # read (requests reads it eagerly because stream is left at its default).
+    with requests.Session() as session:
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+        # Without a timeout a stalled server would block the script forever.
+        return session.get(url, timeout=timeout)
+
+
+# Matches URLs that end in one of the file extensions worth listing separately.
+_EXTENSIONS_PATTERN = re.compile(r"\.xls$|\.xml$|\.xlsx$|\.json$|\.pdf$|\.sql$|\.doc$|\.docx$|\.pptx$|\.txt$|\.zip$|\.tar\.gz$|\.tgz$|\.bak$|\.7z$|\.rar$|\.log$|\.cache$|\.secret$|\.db$|\.backup$|\.yml$|\.gz$|\.config$|\.csv$|\.yaml$|\.md$|\.md5$|\.exe$|\.dll$|\.bin$|\.ini$|\.bat$|\.sh$|\.tar$|\.deb$|\.rpm$|\.iso$|\.img$|\.apk$|\.msi$|\.dmg$|\.tmp$|\.crt$|\.pem$|\.key$|\.pub$|\.asc$", re.IGNORECASE)
+
+
+def _write_lines(path, lines, trailing_newline=False):
+    """Overwrite *path* with *lines* joined by newlines."""
+    text = "\n".join(lines)
+    if trailing_newline and text:
+        text += "\n"
+    with open(path, "w", encoding="utf-8") as file:
+        file.write(text)
+
+
+def _classify_urls(urls):
+    """Return ``(subdomains, directories, urls_by_extension)`` for *urls*."""
+    subdomains = set()
+    directories = set()
+    urls_by_extension = {}
+
+    for archived_url in urls:
+        # Extension matching works on the raw string, so it is done before
+        # (and independently of) URL parsing.
+        match = _EXTENSIONS_PATTERN.search(archived_url)
+        if match:
+            extension = match.group().lstrip(".").lower()
+            urls_by_extension.setdefault(extension, []).append(archived_url)
+
+        try:
+            parsed_url = urlparse(archived_url)
+        except ValueError:
+            # urlparse rejects some malformed archived URLs (e.g. a broken
+            # IPv6 bracket); one bad entry must not abort the whole run.
+            continue
+
+        subdomain = parsed_url.netloc
+        if subdomain:
+            subdomains.add(subdomain)
+
+        if parsed_url.path:
+            # Drop a trailing slash or a trailing "file.ext" path segment.
+            directory = re.sub(r"(/[^/]*\.[^/]*$)|/$", "", parsed_url.path)
+            if directory:
+                directories.add(f"{subdomain}{directory}")
+
+    return subdomains, directories, urls_by_extension
+
+
+def fetch_web_archive_data(domain):
+    """
+    Fetch data from the Wayback Machine for a given domain and save it to files.
+
+    The following files are (re)written under ``web-archive/<domain>/``:
+
+    * ``<domain>.txt``   - every archived URL returned by the CDX API
+    * ``<ext>.txt``      - URLs ending in each matched file extension
+    * ``subdomains.txt`` - sorted unique hosts
+    * ``directory.txt``  - sorted unique host + directory paths
+
+    A summary line is printed on success. Network and filesystem errors are
+    reported on stdout and do not propagate, so a batch run can continue with
+    the next domain.
+
+    Args:
+        domain: Domain name to query, e.g. ``example.com``.
+    """
+    start_time = time.time()
+    url = f"https://web.archive.org/cdx/search/cdx?url=*.{domain}/*&collapse=urlkey&output=text&fl=original"
+
+    try:
+        response = get_response_with_retries(url)
+        response.raise_for_status()
+
+        # Drop blank lines so an empty response counts as zero URLs rather
+        # than as a single empty URL.
+        stripped_lines = (line.strip() for line in response.text.split("\n"))
+        urls = [line for line in stripped_lines if line]
+        total_urls = len(urls)
+
+        # Create directories for saving results
+        domain_dir = os.path.join("web-archive", domain)
+        os.makedirs(domain_dir, exist_ok=True)
+
+        # Save all URLs to a file
+        _write_lines(os.path.join(domain_dir, f"{domain}.txt"), urls)
+
+        subdomains, directories, urls_by_extension = _classify_urls(urls)
+
+        # One write per extension; overwriting (like every other output file)
+        # keeps repeated runs from accumulating duplicate entries.
+        for extension, matching_urls in urls_by_extension.items():
+            extension_file = os.path.join(domain_dir, f"{extension}.txt")
+            _write_lines(extension_file, matching_urls, trailing_newline=True)
+
+        # Save unique subdomains
+        _write_lines(os.path.join(domain_dir, "subdomains.txt"), sorted(subdomains))
+
+        # Save unique directories
+        _write_lines(os.path.join(domain_dir, "directory.txt"), sorted(directories))
+
+        elapsed_time = time.time() - start_time
+        print(f"{Fore.BLUE}{Style.BRIGHT}---------------------------------------{Style.RESET_ALL}")
+        print(f"{Fore.RED}Domain: {domain} | Time: {elapsed_time:.2f}s{Style.RESET_ALL}")
+        print(f"{Fore.CYAN}URL: {total_urls} | Subdomains: {len(subdomains)} | Directories: {len(directories)}{Style.RESET_ALL}")
+
+    except OSError as e:
+        # requests' RequestException derives from OSError, so this covers both
+        # network failures and filesystem errors while writing results.
+        print(f"{Fore.RED}Error processing {domain}: {e}{Style.RESET_ALL}")
+        print(f"{Fore.BLUE}{Style.BRIGHT}---------------------------------------{Style.RESET_ALL}")
+
+
+def process_input(input_data):
+    """
+    Run the archive fetch for one domain, or for every domain listed in a file.
+
+    If *input_data* is the path of an existing file, each non-blank line of it
+    is treated as a domain; otherwise *input_data* itself is the only domain.
+    The total wall-clock time is printed once all domains are done.
+    """
+    started_at = time.time()
+
+    # Default to a single domain; replaced below when the input is a file.
+    targets = [input_data]
+    if os.path.isfile(input_data):
+        with open(input_data, "r", encoding="utf-8") as handle:
+            stripped_rows = (row.strip() for row in handle)
+            targets = [entry for entry in stripped_rows if entry]
+
+    # Domains are handled one after another, in file order.
+    for target in targets:
+        fetch_web_archive_data(target)
+
+    # Separator line above the total processing time
+    print(f"{Fore.BLUE}{Style.BRIGHT}======================================={Style.RESET_ALL}")
+    total_elapsed_time = time.time() - started_at
+    print(f"{Fore.GREEN}Total Processing Time: {total_elapsed_time:.2f} seconds{Style.RESET_ALL}")
+
+
+if __name__ == "__main__":
+    # Script entry point: show the banner, prompt once, then process the answer.
+    display_banner()
+    user_input = input(
+        f"\n{Back.BLACK}{Fore.WHITE}{Style.BRIGHT}Enter a domain or path to a .txt file containing domains: {Style.RESET_ALL}\n> "
+    ).strip()
+    if not user_input:
+        # Nothing usable was typed (empty or whitespace only).
+        print(f"{Fore.RED}Invalid input. Please provide a valid domain or file path.{Style.RESET_ALL}")
+    else:
+        process_input(user_input)