url_checker.py
#!/usr/bin/python3
"""
URL Status Checker - A tool to verify HTTP status codes and redirects for URLs.
"""

import argparse
import csv
import glob
import os
import sys
from datetime import datetime
from typing import Dict, List, Tuple
from urllib.parse import urljoin

import requests

HOST: str = "http://docs.testing.ansible.com"
REQUEST_TIMEOUT: int = 30  # seconds
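
# Example invocations (a minimal sketch; the paths and file names below are
# hypothetical placeholders, not part of this repository):
#
#   ./url_checker.py -u /ansible/latest/index.html
#   ./url_checker.py -f url_paths.txt
#   ./url_checker.py -d ./url_lists
#
# Each path is joined to HOST before checking, and a pair of timestamped
# .txt and .csv reports is written to the current directory.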


def parse_arguments() -> argparse.Namespace:
    """Parse command-line arguments; exactly one input source is required."""
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-d", "--directory", help="Directory that contains txt files with URL paths"
    )
    group.add_argument("-f", "--file", help="Single file that contains URL paths")
    group.add_argument("-u", "--url", help="Single URL path to verify")
    return parser.parse_args()


def get_txt_files(directory: str) -> List[str]:
    """Return a sorted list of .txt files in the directory, or exit on error."""
    if not os.path.isdir(directory):
        print(f"Error: Directory {directory} not found")
        sys.exit(1)
    txt_files = glob.glob(os.path.join(directory, "*.txt"))
    if not txt_files:
        print(f"Error: No .txt files found in {directory}")
        sys.exit(1)
    return sorted(txt_files)


def get_output_filenames(input_file: str) -> Tuple[str, str]:
    """Build timestamped .txt and .csv report filenames from the input filename."""
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_output = f"url_report_{base_name}_{timestamp}"
    return f"{base_output}.txt", f"{base_output}.csv"


def load_urls_from_file(file_path: str) -> List[str]:
    """Read non-empty lines (URL paths) from a file; return an empty list on error."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
        return []
    except IOError as e:
        print(f"Error reading {file_path}: {str(e)}")
        return []


def check_url(url: str) -> Tuple[str, str]:
    """Send a HEAD request and return (status, redirect target) as strings.

    Redirects are not followed; for 301/302 responses the Location header is
    resolved against the original URL. On a request error the exception text
    is returned in place of the status code.
    """
    try:
        response: requests.Response = requests.head(
            url, allow_redirects=False, timeout=REQUEST_TIMEOUT
        )
        status: int = response.status_code
        redirect_url: str = (
            response.headers.get("Location", "") if status in [301, 302] else ""
        )
        if redirect_url and not redirect_url.startswith(("http://", "https://")):
            redirect_url = urljoin(url, redirect_url)
        return str(status), redirect_url
    except requests.RequestException as e:
        return str(e), ""


def process_urls(urls: List[str]) -> List[Dict[str, str]]:
    """Check each URL path against HOST and collect the results."""
    results = []
    for page in urls:
        url = urljoin(HOST, page)
        status, redirect = check_url(url)
        result = {
            "original_path": page,
            "full_url": url,
            "status": status,
            "redirect_url": redirect,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        results.append(result)
    return results


def write_text_report(results: List[Dict[str, str]], output_file: str) -> None:
    """Write a human-readable text report, one block per checked URL."""
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            for result in results:
                f.write(f"Original Path: {result['original_path']}\n")
                f.write(f"Full URL: {result['full_url']}\n")
                f.write(f"Status: {result['status']}\n")
                if result["redirect_url"]:
                    f.write(f"Redirects to: {result['redirect_url']}\n")
                f.write(f"Timestamp: {result['timestamp']}\n")
                f.write("\n")
        print(f"Text report generated: {output_file}")
    except IOError as e:
        print(f"Error writing to {output_file}: {str(e)}")


def write_csv_report(results: List[Dict[str, str]], output_file: str) -> None:
    """Write the results as a CSV report with a header row."""
    fieldnames = ["original_path", "full_url", "status", "redirect_url", "timestamp"]
    try:
        with open(output_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
        print(f"CSV report generated: {output_file}")
    except IOError as e:
        print(f"Error writing to {output_file}: {str(e)}")


def handle_single_url(url: str) -> None:
    """Check a single URL path and write both report formats."""
    results = process_urls([url])
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    txt_file = f"url_report_single_{timestamp}.txt"
    csv_file = f"url_report_single_{timestamp}.csv"
    write_text_report(results, txt_file)
    write_csv_report(results, csv_file)


def handle_single_file(file_path: str) -> None:
    """Check all URL paths listed in one file and write both report formats."""
    urls = load_urls_from_file(file_path)
    if urls:
        results = process_urls(urls)
        txt_file, csv_file = get_output_filenames(file_path)
        write_text_report(results, txt_file)
        write_csv_report(results, csv_file)


def handle_directory(directory: str) -> None:
    """Check URL paths from every .txt file in a directory, one report pair per file."""
    txt_files = get_txt_files(directory)
    print(f"Found {len(txt_files)} txt files in {directory}")
    for file_path in txt_files:
        print(f"\nProcessing {file_path}...")
        urls = load_urls_from_file(file_path)
        if urls:
            results = process_urls(urls)
            txt_file, csv_file = get_output_filenames(file_path)
            write_text_report(results, txt_file)
            write_csv_report(results, csv_file)


def main() -> None:
    """Dispatch to the handler that matches the selected input source."""
    args = parse_arguments()
    if args.url:
        handle_single_url(args.url)
    elif args.file:
        handle_single_file(args.file)
    else:
        handle_directory(args.directory)


if __name__ == "__main__":
    main()