
Commit fda007c

Made requested changes
1 parent efc1fc6 commit fda007c

File tree

2 files changed (+37, -22 lines)


scripts/1-fetch/wikipedia_fetch.py

Lines changed: 36 additions & 20 deletions
@@ -30,7 +30,12 @@
 FILE_LANGUAGES = os.path.join(
     PATHS["data_phase"], "wikipedia_count_by_languages.csv"
 )
-HEADER_LANGUAGES = ["LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT"]
+HEADER_LANGUAGES = [
+    "LANGUAGE_CODE",
+    "LANGUAGE_NAME",
+    "LANGUAGE_NAME_EN",
+    "COUNT",
+]
 QUARTER = os.path.basename(PATHS["data_quarter"])
 WIKIPEDIA_BASE_URL = "https://en.wikipedia.org/w/api.php"
 WIKIPEDIA_MATRIX_URL = "https://meta.wikimedia.org/w/api.php"
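For reference, the new LANGUAGE_NAME_EN column presumably lines up with the dict keys that query_wikipedia_languages() now emits (see the second hunk below); the CSV-writing code itself is outside this diff. A minimal sketch of how the header and rows might be joined, where the output filename, the csv.DictWriter usage, and the single German Wikipedia row are illustrative assumptions rather than the script's actual code:

# Illustrative only: joining the four-column header with one made-up row.
import csv

HEADER_LANGUAGES = [
    "LANGUAGE_CODE",
    "LANGUAGE_NAME",
    "LANGUAGE_NAME_EN",
    "COUNT",
]

rows = [
    {
        "LANGUAGE_CODE": "de",
        "LANGUAGE_NAME": "Deutsch",
        "LANGUAGE_NAME_EN": "German",
        "COUNT": 2900000,  # placeholder value, not a fetched statistic
    },
]

with open("wikipedia_count_by_languages.csv", "w", newline="") as file_obj:
    writer = csv.DictWriter(file_obj, fieldnames=HEADER_LANGUAGES)
    writer.writeheader()
    writer.writerows(rows)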
@@ -91,25 +96,28 @@ def query_wikipedia_languages(session):
     tool_data = []

     # Gets all language wikipedias
-    params = {"action": "sitematrix", "format": "json"}
+    params = {"action": "sitematrix", "format": "json", "uselang": "en"}
     r = session.get(WIKIPEDIA_MATRIX_URL, params=params, timeout=30)
     data = r.json()["sitematrix"]

     languages = []
     for key, val in data.items():
+        if not isinstance(val, dict):
+            continue
         if key.isdigit():
             language_code = val.get("code")
             language_name = val.get("name")
-            for site in val.get("site", []):
-                if "wikipedia.org" in site["url"]:
-                    languages.append(
-                        {
-                            "code": language_code,
-                            "name": language_name,
-                            "url": site["url"],
-                        }
-                    )
-
+            language_name_en = val.get("localname")
+            for site in val.get("site", []):
+                if "wikipedia.org" in site["url"]:
+                    languages.append(
+                        {
+                            "code": language_code,
+                            "name": language_name,
+                            "name_en": language_name_en,
+                            "url": site["url"],
+                        }
+                    )
     # For each language wikipedia, fetch statistics.
     for site in languages:
         base_url = f"{site['url']}/w/api.php"
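As background for the hunk above: the sitematrix response keys are mostly numeric strings mapping to per-language dicts with code, name (the language's own name), localname (its name in the request language, which uselang=en pins to English), and a site list; the response also carries non-dict entries such as count and specials, which is what the new isinstance guard skips. A standalone sketch of that lookup, independent of the script's session and logging helpers:

# Standalone sketch: fetch the sitematrix and show why the commit reads both
# "name" and "localname" for each language.
import requests

MATRIX_URL = "https://meta.wikimedia.org/w/api.php"

params = {"action": "sitematrix", "format": "json", "uselang": "en"}
data = requests.get(MATRIX_URL, params=params, timeout=30).json()["sitematrix"]

for key, val in data.items():
    # Non-language entries ("count", "specials") are skipped, matching the
    # isinstance/isdigit guards added in the diff above.
    if not isinstance(val, dict) or not key.isdigit():
        continue
    # "name" is the language's own name; "localname" is the English name
    # because of uselang=en.
    print(val.get("code"), val.get("localname"), val.get("name"))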
@@ -124,24 +132,32 @@ def query_wikipedia_languages(session):
             r.raise_for_status()
             data = r.json()
             stats = data["query"]["statistics"]
-
             article_count = stats.get("articles", 0)
+            language_code = site["code"]
+            language_name = site["name"]
+            language_name_en = site["name_en"]
+
+            if language_name:
+                language_display = (
+                    f"{language_code} {language_name_en} ({language_name})"
+                )
+            else:
+                language_display = f"{language_code} {language_name_en}"
             if article_count == 0:
-                LOGGER.info(f"Skipping {language_name} with 0 articles")
+                LOGGER.info(f"Skipping {language_display} with 0 articles")
                 continue
             tool_data.append(
                 {
-                    "LANGUAGE_CODE": site["code"],
-                    "LANGUAGE_NAME": site["name"],
+                    "LANGUAGE_CODE": language_code,
+                    "LANGUAGE_NAME": language_name,
+                    "LANGUAGE_NAME_EN": language_name_en,
                     "COUNT": article_count,
                 }
             )
-            LOGGER.info(f"{site['code']} ({site['name']}): {article_count}")
+            LOGGER.info(f"{language_display}: {article_count}")

         except Exception as e:
-            LOGGER.warning(
-                f"Failed to fetch for {site['code']} ({site['name']}): {e}"
-            )
+            LOGGER.warning(f"Failed to fetch for {language_display}: {e}")

     return tool_data
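The statistics request itself sits outside these hunks, but the query -> statistics -> articles path in the response suggests MediaWiki's standard siteinfo statistics call. A self-contained sketch, where the article_count helper name is illustrative and not part of the script:

# Standalone sketch of the per-wiki statistics call consumed above.
import requests


def article_count(wiki_url: str) -> int:
    params = {
        "action": "query",
        "meta": "siteinfo",
        "siprop": "statistics",
        "format": "json",
    }
    r = requests.get(f"{wiki_url}/w/api.php", params=params, timeout=30)
    r.raise_for_status()
    stats = r.json()["query"]["statistics"]
    return stats.get("articles", 0)


print(article_count("https://en.wikipedia.org"))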
scripts/shared.py

Lines changed: 1 addition & 2 deletions
@@ -9,14 +9,13 @@

 STATUS_FORCELIST = [
     408,  # Request Timeout
-    422,  # Unprocessable Content (Validation failed, or endpoint spammed)
+    422,  # Unprocessable Content (Validation failed, endpoint spammed, etc.)
     429,  # Too Many Requests
     500,  # Internal Server Error
     502,  # Bad Gateway
     503,  # Service Unavailable
     504,  # Gateway Timeout
 ]
-
 USER_AGENT = (
     "QuantifyingTheCommons/1.0 "
     "(https://github.com/creativecommons/quantifying)"
