3
3
from xml .dom import minidom
4
4
from tqdm import tqdm
5
5
import re
6
+ import urllib .parse
6
7
7
8
# --------------------------------------------------------------------
8
9
# 1) Definitions & Constants
@@ -57,28 +58,20 @@ def parse_paths_from_summary(summary_text):
57
58
58
59
Returns a list of unique paths (without duplicates).
59
60
"""
60
- # Regex to find standard Markdown links: [some text](some/path)
61
- # Capture everything inside parentheses after the bracket, ignoring any leading/trailing spaces.
62
61
pattern = r"\[[^\]]+\]\(\s*([^)]+?)\s*\)"
63
62
matches = re .findall (pattern , summary_text )
64
63
65
64
cleaned_paths = []
66
65
for path in matches :
67
- # Trim whitespace just in case
68
66
path = path .strip ()
69
67
70
68
# 1) Handle /README.md -> /index.html
71
- # (anywhere in the path, not just the very end, but typically it should be at the end)
72
69
if path .endswith ("README.md" ):
73
70
path = path [:- 9 ] + "index.html"
74
-
75
71
# 2) Else if it ends with .md -> .html
76
72
elif path .endswith (".md" ):
77
73
path = path [:- 3 ] + ".html"
78
74
79
- # You asked NOT to remove /index or trailing slashes
80
- # so we won't do any extra trimming beyond that.
81
-
82
75
# Avoid duplicates
83
76
if path not in cleaned_paths :
84
77
cleaned_paths .append (path )
@@ -113,22 +106,25 @@ def prettify_xml(element):
113
106
def add_translated_urls(url_element, base_domain, path):
    """
    Attach hreflang alternate links for every language to a sitemap <url>.

    Each link has the form:
      https://<base_domain>/<lang_code>/<path>
    An additional ``x-default`` link is added first and points at the
    English (``/en/``) version of the page.

    Args:
        url_element: The ElementTree element that receives the
            ``<xhtml:link>`` children; it is modified in place.
        base_domain: Host name inserted after ``https://``.
        path: Page path appended after the language code. It is
            percent-encoded here, so callers pass the raw path.

    Returns:
        None.
    """
    link_tag = '{http://www.w3.org/1999/xhtml}link'

    # Percent-encode characters that are not URL-safe (e.g. '+' or spaces).
    # '%' stays in the safe set so existing percent-escapes in the path are
    # left untouched instead of being encoded a second time.
    encoded_path = urllib.parse.quote(path, safe="/:?=&%")

    # x-default points to the English version.
    xdefault_link = ET.SubElement(url_element, link_tag)
    xdefault_link.set('rel', 'alternate')
    xdefault_link.set('hreflang', 'x-default')
    xdefault_link.set('href', f"https://{base_domain}/en/{encoded_path}")

    # One <xhtml:link> per language code. `languages` is a module-level
    # mapping defined elsewhere in this file; only its values are used.
    for lang_code in languages.values():
        alt_link = ET.SubElement(url_element, link_tag)
        alt_link.set('rel', 'alternate')
        alt_link.set('hreflang', lang_code)
        alt_link.set('href', f"https://{base_domain}/{lang_code}/{encoded_path}")
132
128
133
129
# --------------------------------------------------------------------
134
130
# 3) Main logic
@@ -147,17 +143,16 @@ def main():
147
143
ET .register_namespace ('xhtml' , "http://www.w3.org/1999/xhtml" )
148
144
root = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset' )
149
145
150
- # ----------------------------------------------------------------
151
- # 3.1) Process Book paths
152
- # ----------------------------------------------------------------
153
146
print ("**Processing Book paths**..." )
154
147
for p in tqdm (book_paths , desc = "Book paths" ):
155
- # Create <url> element
156
148
url_element = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
157
149
158
- # Our base location for English is domain/en/path
150
+ # Encode path to handle special chars like '+'
151
+ encoded_path = urllib .parse .quote (p , safe = "/:?=&%" )
152
+
153
+ # Base location: domain/en/encoded_path
159
154
loc_el = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
160
- full_en_url = f"https://{ BOOK_DOMAIN } /en/{ p } "
155
+ full_en_url = f"https://{ BOOK_DOMAIN } /en/{ encoded_path } "
161
156
loc_el .text = full_en_url
162
157
163
158
# Priority calculation
@@ -168,17 +163,15 @@ def main():
168
163
add_translated_urls (url_element , BOOK_DOMAIN , p )
169
164
root .append (url_element )
170
165
171
- # ----------------------------------------------------------------
172
- # 3.2) Process Cloud paths
173
- # ----------------------------------------------------------------
174
166
print ("**Processing Cloud paths**..." )
175
167
for p in tqdm (cloud_paths , desc = "Cloud paths" ):
176
- # Create <url> element
177
168
url_element = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
178
169
179
- # Our base location for English is domain/en/path
170
+ encoded_path = urllib .parse .quote (p , safe = "/:?=&%" )
171
+
172
+ # Base location: domain/en/encoded_path
180
173
loc_el = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
181
- full_en_url = f"https://{ CLOUD_DOMAIN } /en/{ p } "
174
+ full_en_url = f"https://{ CLOUD_DOMAIN } /en/{ encoded_path } "
182
175
loc_el .text = full_en_url
183
176
184
177
# Priority calculation
@@ -189,9 +182,6 @@ def main():
189
182
add_translated_urls (url_element , CLOUD_DOMAIN , p )
190
183
root .append (url_element )
191
184
192
- # ----------------------------------------------------------------
193
- # 3.3) Write the final sitemap
194
- # ----------------------------------------------------------------
195
185
print ("**Generating final sitemap**..." )
196
186
sitemap_xml = prettify_xml (root )
197
187
with open ("sitemap.xml" , "w" , encoding = "utf-8" ) as f :
0 commit comments