Skip to content

Commit e03bbe5

Browse files
committed
v1
1 parent c80df0a commit e03bbe5

File tree

2 files changed

+207
-0
lines changed

2 files changed

+207
-0
lines changed

generate_sitemap.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
import requests
2+
import xml.etree.ElementTree as ET
3+
from xml.dom import minidom
4+
from tqdm import tqdm
5+
import re
6+
7+
# --------------------------------------------------------------------
# 1) Definitions & Constants
# --------------------------------------------------------------------

# Raw SUMMARY.md files listing every page of each book.
SUMMARY_URL_BOOK = "https://raw.githubusercontent.com/HackTricks-wiki/hacktricks/refs/heads/master/src/SUMMARY.md"
SUMMARY_URL_CLOUD = "https://raw.githubusercontent.com/HackTricks-wiki/hacktricks-cloud/refs/heads/master/src/SUMMARY.md"

# Domains the generated sitemap URLs point to.
BOOK_DOMAIN = "book.hacktricks.wiki"
CLOUD_DOMAIN = "cloud.hacktricks.wiki"

# Translated language codes (English is handled separately as the default).
# Kept as a code -> code dict because downstream code iterates .values().
languages = {
    code: code
    for code in (
        "es", "af", "zh", "fr", "de", "el", "hi", "it",
        "ja", "ko", "pl", "pt", "sr", "sw", "tr", "uk",
    )
}
35+
36+
# --------------------------------------------------------------------
37+
# 2) Helper Functions
38+
# --------------------------------------------------------------------
39+
def fetch_summary(url):
    """Download a SUMMARY.md-style file and return its body as text.

    Raises:
        requests.HTTPError: if the server responds with a non-2xx status.
    """
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp.text
44+
45+
def parse_paths_from_summary(summary_text):
    """Extract link targets from SUMMARY.md content, normalized for the site.

    Transformations applied to each Markdown link target `[Title](path)`:
      - a 'README.md' suffix becomes 'index.html'
      - any other '.md' suffix becomes '.html'
      - '/index' segments and trailing slashes are deliberately preserved

    Args:
        summary_text: raw Markdown content of a SUMMARY.md file.

    Returns:
        list[str]: unique normalized paths, in first-seen order.
    """
    # Standard Markdown links: [some text](some/path), tolerating
    # leading/trailing whitespace inside the parentheses.
    link_pattern = r"\[[^\]]+\]\(\s*([^)]+?)\s*\)"

    seen = set()          # O(1) membership test (was an O(n) list scan per path)
    ordered_paths = []
    for raw in re.findall(link_pattern, summary_text):
        path = raw.strip()

        if path.endswith("README.md"):
            # '/README.md' -> '/index.html' (len("README.md") == 9)
            path = path[:-9] + "index.html"
        elif path.endswith(".md"):
            path = path[:-3] + ".html"

        # Keep only the first occurrence, preserving document order.
        if path not in seen:
            seen.add(path)
            ordered_paths.append(path)

    return ordered_paths
87+
88+
def compute_priority_from_depth(path):
    """Map a path's folder depth to a sitemap <priority> value.

    Depth 0 -> 1.0; each additional folder level subtracts 0.1,
    floored at 0.5.
    """
    trimmed = path.strip('/')
    # An empty path (site root) counts as depth zero.
    depth = trimmed.count('/') if trimmed else 0
    return max(1.0 - (0.1 * depth), 0.5)
105+
106+
def prettify_xml(element):
    """Serialize an ElementTree element as indented XML text.

    The returned string includes an XML declaration with UTF-8 encoding.
    """
    raw = ET.tostring(element, encoding='utf-8')
    dom = minidom.parseString(raw)
    return dom.toprettyxml(indent=" ", encoding="UTF-8").decode('UTF-8')
112+
113+
def add_translated_urls(url_element, base_domain, path):
    """Append <xhtml:link rel="alternate"> hreflang entries to a <url> element.

    Each alternate has the form https://<base_domain>/<lang>/<path>.
    Links are added for x-default (pointing at English), English itself,
    and every translated language in `languages`.

    NOTE: the explicit hreflang="en" link is required for a valid hreflang
    cluster — each page version, including the canonical English one, must
    reference itself (per Google's hreflang guidelines). The original code
    omitted it.
    """
    xhtml_link = '{http://www.w3.org/1999/xhtml}link'

    def _add_alternate(hreflang, lang_segment):
        # One <xhtml:link rel="alternate" hreflang=... href=...> per call.
        link = ET.SubElement(url_element, xhtml_link)
        link.set('rel', 'alternate')
        link.set('hreflang', hreflang)
        link.set('href', f"https://{base_domain}/{lang_segment}/{path}")

    # x-default falls back to the English version.
    _add_alternate('x-default', 'en')
    # Self-referencing English alternate (see NOTE above).
    _add_alternate('en', 'en')
    # One alternate per translated language.
    for lang_code in languages.values():
        _add_alternate(lang_code, lang_code)
132+
133+
# --------------------------------------------------------------------
# 3) Main logic
# --------------------------------------------------------------------

# Namespace URIs used when building the sitemap tree.
_SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
_XHTML_NS = "http://www.w3.org/1999/xhtml"


def _append_url_entries(root, domain, paths, desc):
    """Append one <url> element (loc, priority, alternates) per path.

    The canonical <loc> is the English version: https://<domain>/en/<path>.
    Extracted helper: the Book and Cloud loops in the original were
    duplicated line-for-line, differing only in domain and progress label.
    """
    for p in tqdm(paths, desc=desc):
        url_element = ET.Element('{%s}url' % _SITEMAP_NS)

        # Canonical English location.
        loc_el = ET.SubElement(url_element, '{%s}loc' % _SITEMAP_NS)
        loc_el.text = f"https://{domain}/en/{p}"

        # Priority derived from folder depth (deeper pages rank lower).
        priority_el = ET.SubElement(url_element, '{%s}priority' % _SITEMAP_NS)
        priority_el.text = f"{compute_priority_from_depth(p):.2f}"

        # hreflang alternates for every language version.
        add_translated_urls(url_element, domain, p)
        root.append(url_element)


def main():
    """Build sitemap.xml covering the Book and Cloud HackTricks sites."""
    print("**Fetching SUMMARY files**...")
    book_summary = fetch_summary(SUMMARY_URL_BOOK)
    cloud_summary = fetch_summary(SUMMARY_URL_CLOUD)

    print("**Extracting paths from summaries**...")
    book_paths = parse_paths_from_summary(book_summary)
    cloud_paths = parse_paths_from_summary(cloud_summary)

    # Register namespaces so serialization uses the expected prefixes
    # (default namespace for sitemap elements, 'xhtml' for alternates).
    ET.register_namespace('', _SITEMAP_NS)
    ET.register_namespace('xhtml', _XHTML_NS)
    root = ET.Element('{%s}urlset' % _SITEMAP_NS)

    print("**Processing Book paths**...")
    _append_url_entries(root, BOOK_DOMAIN, book_paths, "Book paths")

    print("**Processing Cloud paths**...")
    _append_url_entries(root, CLOUD_DOMAIN, cloud_paths, "Cloud paths")

    print("**Generating final sitemap**...")
    sitemap_xml = prettify_xml(root)
    with open("sitemap.xml", "w", encoding="utf-8") as f:
        f.write(sitemap_xml)

    print("**sitemap.xml has been successfully generated.**")


if __name__ == "__main__":
    main()

robots.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Sitemap: https://www.hacktricks.wiki/sitemap.xml
2+
3+
User-agent: *
4+
Disallow:

0 commit comments

Comments
 (0)