Skip to content

Commit 48ee315

Browse files
committed
generate sitemap improved
1 parent 2a843ce commit 48ee315

File tree

2 files changed

+221
-231
lines changed

2 files changed

+221
-231
lines changed

generate_sitemap.py

+17 -27
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
from xml.dom import minidom
44
from tqdm import tqdm
55
import re
6+
import urllib.parse
67

78
# --------------------------------------------------------------------
89
# 1) Definitions & Constants
@@ -57,28 +58,20 @@ def parse_paths_from_summary(summary_text):
5758
5859
Returns a list of unique paths (without duplicates).
5960
"""
60-
# Regex to find standard Markdown links: [some text](some/path)
61-
# Capture everything inside parentheses after the bracket, ignoring any leading/trailing spaces.
6261
pattern = r"\[[^\]]+\]\(\s*([^)]+?)\s*\)"
6362
matches = re.findall(pattern, summary_text)
6463

6564
cleaned_paths = []
6665
for path in matches:
67-
# Trim whitespace just in case
6866
path = path.strip()
6967

7068
# 1) Handle /README.md -> /index.html
71-
# (anywhere in the path, not just the very end, but typically it should be at the end)
7269
if path.endswith("README.md"):
7370
path = path[:-9] + "index.html"
74-
7571
# 2) Else if it ends with .md -> .html
7672
elif path.endswith(".md"):
7773
path = path[:-3] + ".html"
7874

79-
# You asked NOT to remove /index or trailing slashes
80-
# so we won't do any extra trimming beyond that.
81-
8275
# Avoid duplicates
8376
if path not in cleaned_paths:
8477
cleaned_paths.append(path)
@@ -113,22 +106,25 @@ def prettify_xml(element):
113106
def add_translated_urls(url_element, base_domain, path):
114107
"""
115108
Add translated URLs with language codes, e.g.:
116-
https://<base_domain>/<lang_code><path>
117-
109+
https://<base_domain>/<lang_code>/<path>
118110
Also sets x-default to English by default.
119111
"""
112+
113+
# Encode the path for safety
114+
encoded_path = urllib.parse.quote(path, safe="/:?=&%")
115+
120116
# We'll set x-default to the English version
121117
xdefault_link = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
122118
xdefault_link.set('rel', 'alternate')
123119
xdefault_link.set('hreflang', 'x-default')
124-
xdefault_link.set('href', f"https://{base_domain}/en/{path}")
120+
xdefault_link.set('href', f"https://{base_domain}/en/{encoded_path}")
125121

126122
# Add one <xhtml:link> for each language
127123
for lang_code in languages.values():
128124
alt_link = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
129125
alt_link.set('rel', 'alternate')
130126
alt_link.set('hreflang', lang_code)
131-
alt_link.set('href', f"https://{base_domain}/{lang_code}/{path}")
127+
alt_link.set('href', f"https://{base_domain}/{lang_code}/{encoded_path}")
132128

133129
# --------------------------------------------------------------------
134130
# 3) Main logic
@@ -147,17 +143,16 @@ def main():
147143
ET.register_namespace('xhtml', "http://www.w3.org/1999/xhtml")
148144
root = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset')
149145

150-
# ----------------------------------------------------------------
151-
# 3.1) Process Book paths
152-
# ----------------------------------------------------------------
153146
print("**Processing Book paths**...")
154147
for p in tqdm(book_paths, desc="Book paths"):
155-
# Create <url> element
156148
url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
157149

158-
# Our base location for English is domain/en/path
150+
# Encode path to handle special chars like '+'
151+
encoded_path = urllib.parse.quote(p, safe="/:?=&%")
152+
153+
# Base location: domain/en/encoded_path
159154
loc_el = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
160-
full_en_url = f"https://{BOOK_DOMAIN}/en/{p}"
155+
full_en_url = f"https://{BOOK_DOMAIN}/en/{encoded_path}"
161156
loc_el.text = full_en_url
162157

163158
# Priority calculation
@@ -168,17 +163,15 @@ def main():
168163
add_translated_urls(url_element, BOOK_DOMAIN, p)
169164
root.append(url_element)
170165

171-
# ----------------------------------------------------------------
172-
# 3.2) Process Cloud paths
173-
# ----------------------------------------------------------------
174166
print("**Processing Cloud paths**...")
175167
for p in tqdm(cloud_paths, desc="Cloud paths"):
176-
# Create <url> element
177168
url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
178169

179-
# Our base location for English is domain/en/path
170+
encoded_path = urllib.parse.quote(p, safe="/:?=&%")
171+
172+
# Base location: domain/en/encoded_path
180173
loc_el = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
181-
full_en_url = f"https://{CLOUD_DOMAIN}/en/{p}"
174+
full_en_url = f"https://{CLOUD_DOMAIN}/en/{encoded_path}"
182175
loc_el.text = full_en_url
183176

184177
# Priority calculation
@@ -189,9 +182,6 @@ def main():
189182
add_translated_urls(url_element, CLOUD_DOMAIN, p)
190183
root.append(url_element)
191184

192-
# ----------------------------------------------------------------
193-
# 3.3) Write the final sitemap
194-
# ----------------------------------------------------------------
195185
print("**Generating final sitemap**...")
196186
sitemap_xml = prettify_xml(root)
197187
with open("sitemap.xml", "w", encoding="utf-8") as f:

0 commit comments

Comments
 (0)