
Commit fcb279c (initial commit, 0 parents)

Author: asaph wilomousky
Commit message: Simple Python script to identify mixed content in a URL list

File tree: 3 files changed (+90, -0 lines)


.vscode/launch.json (+15)

{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal"
        }
    ]
}

find_mixed_content.py (+74)

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import os


def find_mixed_content(url, crawled_urls=None, reported_urls=None, max_depth=1, current_depth=0):
    # Use None defaults instead of mutable default arguments, so each
    # top-level call starts with fresh crawl/report state.
    if crawled_urls is None:
        crawled_urls = set()
    if reported_urls is None:
        reported_urls = set()

    if current_depth > max_depth or url in crawled_urls:
        return

    crawled_urls.add(url)

    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch the URL: {url}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')

        # Tags and the attribute on each that can reference a subresource URL
        tags_attributes = {
            'img': 'src',
            'link': 'href',
            'script': 'src',
            'iframe': 'src',
            'video': 'src',
            'audio': 'src',
            'source': 'src',
            'embed': 'src',
            'object': 'data'
        }

        mixed_content = []
        for tag, attr in tags_attributes.items():
            for element in soup.find_all(tag):
                src = element.get(attr)
                # An absolute http:// reference is insecure content
                if src and urlparse(src).scheme == 'http':
                    mixed_content.append(os.path.basename(src))

        if mixed_content and url not in reported_urls:
            print(f"Mixed content found on {url}:")
            for item in mixed_content:
                print(item)
            reported_urls.add(url)
        else:
            if url not in reported_urls:
                print(f"No mixed content found on {url}.")
                reported_urls.add(url)

        # Until max_depth is reached, find and check internal links as well
        if current_depth < max_depth:
            base_netloc = urlparse(url).netloc
            for link in soup.find_all('a', href=True):
                link_url = urljoin(url, link['href'])
                parsed_link_url = urlparse(link_url)
                # Avoid crawling external hosts and already crawled links
                if (parsed_link_url.scheme in ('http', 'https')
                        and parsed_link_url.netloc == base_netloc
                        and link_url not in crawled_urls):
                    find_mixed_content(link_url, crawled_urls, reported_urls, max_depth, current_depth + 1)

    except Exception as e:
        print(f"An error occurred: {e}")


def check_urls_from_file(file_path, max_depth=2):
    try:
        with open(file_path, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        for url in urls:
            find_mixed_content(url, max_depth=max_depth)
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")


# Example usage
file_path = 'urls.txt'  # Path to your file containing URLs
max_depth = 2           # Change the depth as needed
check_urls_from_file(file_path, max_depth=max_depth)
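
One caveat worth flagging: the scan only marks attribute values whose scheme is literally `http`, so scheme-relative references such as `//cdn.example/app.js` and plain relative paths are never inspected, even though browsers resolve them against the page URL. A minimal sketch of a stricter predicate (the `is_mixed` helper and the example hosts are hypothetical, not part of this commit):

from urllib.parse import urljoin, urlparse

def is_mixed(page_url, src):
    # Resolve the reference against the page URL, the same way a browser
    # does, so relative and scheme-relative ('//host/x') values are
    # classified correctly.
    absolute = urljoin(page_url, src)
    # Mixed content: an https page pulling an http subresource
    return urlparse(page_url).scheme == 'https' and urlparse(absolute).scheme == 'http'

# On an https page, '//cdn.example/image.gif' resolves to https and is
# fine, while 'http://cdn.example/image.gif' is flagged.
assert is_mixed('https://example.com/page', 'http://cdn.example/image.gif')
assert not is_mixed('https://example.com/page', '//cdn.example/image.gif')

Swapping this predicate into the tag loop (passing the page URL alongside each attribute value) would classify references the way a browser resolves them.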

urls.txt (+1)

https://googlesamples.github.io/web-fundamentals/fundamentals/security/prevent-mixed-content/simple-example.html
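
Since find_mixed_content.py already calls check_urls_from_file at module level with file_path = 'urls.txt', running the script from the directory containing both files should be enough:

python find_mixed_content.py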
