# Path of the per-language article-count CSV: the file
# "wikipedia_count_by_languages.csv" inside PATHS["data_phase"].
FILE_LANGUAGES = os.path.join(
    PATHS["data_phase"], "wikipedia_count_by_languages.csv"
)
# Field names for the per-language rows. They match the keys of the row
# dicts that query_wikipedia_languages() appends to its result, including
# LANGUAGE_NAME_EN alongside LANGUAGE_NAME.
# NOTE(review): presumably also the CSV header / column order of
# FILE_LANGUAGES -- confirm against the code that writes the file.
HEADER_LANGUAGES = [
    "LANGUAGE_CODE",
    "LANGUAGE_NAME",
    "LANGUAGE_NAME_EN",
    "COUNT",
]
# Final path component of PATHS["data_quarter"].
QUARTER = os.path.basename(PATHS["data_quarter"])
# MediaWiki API endpoint on en.wikipedia.org.
WIKIPEDIA_BASE_URL = "https://en.wikipedia.org/w/api.php"
# MediaWiki API endpoint on meta.wikimedia.org; query_wikipedia_languages()
# sends its "sitematrix" request here to enumerate the language wikis.
WIKIPEDIA_MATRIX_URL = "https://meta.wikimedia.org/w/api.php"
@@ -91,25 +96,28 @@ def query_wikipedia_languages(session):
9196 tool_data = []
9297
9398 # Gets all language wikipedias
94- params = {"action" : "sitematrix" , "format" : "json" }
99+ params = {"action" : "sitematrix" , "format" : "json" , "uselang" : "en" }
95100 r = session .get (WIKIPEDIA_MATRIX_URL , params = params , timeout = 30 )
96101 data = r .json ()["sitematrix" ]
97102
98103 languages = []
99104 for key , val in data .items ():
105+ if not isinstance (val , dict ):
106+ continue
100107 if key .isdigit ():
101108 language_code = val .get ("code" )
102109 language_name = val .get ("name" )
103- for site in val .get ("site" , []):
104- if "wikipedia.org" in site ["url" ]:
105- languages .append (
106- {
107- "code" : language_code ,
108- "name" : language_name ,
109- "url" : site ["url" ],
110- }
111- )
112-
110+ language_name_en = val .get ("localname" )
111+ for site in val .get ("site" , []):
112+ if "wikipedia.org" in site ["url" ]:
113+ languages .append (
114+ {
115+ "code" : language_code ,
116+ "name" : language_name ,
117+ "name_en" : language_name_en ,
118+ "url" : site ["url" ],
119+ }
120+ )
113121 # For each language wikipedia, fetch statistics.
114122 for site in languages :
115123 base_url = f"{ site ['url' ]} /w/api.php"
@@ -124,24 +132,32 @@ def query_wikipedia_languages(session):
124132 r .raise_for_status ()
125133 data = r .json ()
126134 stats = data ["query" ]["statistics" ]
127-
128135 article_count = stats .get ("articles" , 0 )
136+ language_code = site ["code" ]
137+ language_name = site ["name" ]
138+ language_name_en = site ["name_en" ]
139+
140+ if language_name :
141+ language_display = (
142+ f"{ language_code } { language_name_en } ({ language_name } )"
143+ )
144+ else :
145+ language_display = f"{ language_code } { language_name_en } "
129146 if article_count == 0 :
130- LOGGER .info (f"Skipping { language_name } with 0 articles" )
147+ LOGGER .info (f"Skipping { language_display } with 0 articles" )
131148 continue
132149 tool_data .append (
133150 {
134- "LANGUAGE_CODE" : site ["code" ],
135- "LANGUAGE_NAME" : site ["name" ],
151+ "LANGUAGE_CODE" : language_code ,
152+ "LANGUAGE_NAME" : language_name ,
153+ "LANGUAGE_NAME_EN" : language_name_en ,
136154 "COUNT" : article_count ,
137155 }
138156 )
139- LOGGER .info (f"{ site [ 'code' ] } ( { site [ 'name' ] } ) : { article_count } " )
157+ LOGGER .info (f"{ language_display } : { article_count } " )
140158
141159 except Exception as e :
142- LOGGER .warning (
143- f"Failed to fetch for { site ['code' ]} ({ site ['name' ]} ): { e } "
144- )
160+ LOGGER .warning (f"Failed to fetch for { language_display } ): { e } " )
145161
146162 return tool_data
147163
0 commit comments