"""
At the time this script was created (July 2020), GitHub did not offer an
official way to query dependent packages through their API. So, we went
for a web-scraping approach using BeautifulSoup, patterned after a response
in this stack-overflow thread:
https://stackoverflow.com/questions/58734176/how-to-use-github-api-to-get-a-repositorys-dependents-information-in-github
To retrieve topic lists via the GitHub API, the user must have defined a
GITHUB_TOKEN environment variable.
This script generates three lists of packages:
1.) One that has ALL dependents that are active repositories (i.e. no "Ghost"
icon in the web page).
2.) Another one that only retains packages with >= min_stars stars, but also
includes a list of the GitHub "topics" associated with each package.
3.) A third list that is based on filtering the second list. During filtering,
a package is retained if either:
a.) Any string from repo_name_terms is in the repository organization/name
b.) A topic in the repo's topic list matches a topic in topic_search_terms
The three variables containing the lists described above are:
Outputs
-------
all_packages : list of tuple
Each element is a (name, forks, stars) tuple.
popular_packages : list of tuple
Each element is a (name, forks, stars, topics) tuple.
popular_filtered_packages : list, tuple
Each element is a (name, forks, stars, topics) tuple.
"""
import os
import pickle
from bs4 import BeautifulSoup
from github import Github
import pandas
import requests
# we use PyGitHub to retrieve topic lists
token = os.environ['GITHUB_TOKEN']
g = Github(token)
# ----------------------------------
# START OF USER-CONFIGURABLE OPTIONS
# ----------------------------------
# The repository we will query (whose dependents we want to find)
repo_to_query = "scikit-image/scikit-image"
# Retrieve detailed topic lists only for packages with >= min_stars stars.
min_stars = 5
# If True, will write the three lists to .pickle files (note: the pickle
# branch below changes to a hard-coded output directory before writing)
save_to_pickle = False
# If True, will write the three lists to .csv files in the current directory
save_to_csv = True
# Search terms of interest in the repository organization/name.
# (see description at top)
# All terms should be in lower case.
repo_name_terms = [
    'brain',
    'cell',
    'ecg',
    'eeg',
    'medi',
    'mri',
    'neuro',
    'pathol',
    'retin',
    'slide',
    'spectro',
    'tissue',
    'tomo',
]
# Search terms of interest in the repository's topics (see description at top).
# This list was created to match bio-image applications by manually curating
# topic names from the full list of dependent packages.
topic_search_terms = [
    'airways',
    'anatomy',
    'arteries',
    'astrocytes',
    'atomic-force-microscopy',
    'afm',
    'axon',
    'bioimage-informatics',
    'bioinformatics',
    'biologists',
    'biomedical-image-processing',
    'bionic-vision',
    'biophysics',
    'brain-connectivity',
    'brain-imaging',
    'brain-mri',
    'brain-tumor-segmentation',
    'brats',
    'calcium',
    'cancer-research',
    'cell-biology',
    'cell-detection',
    'cell-segmentation',
    'computational-pathology',
    'connectome',
    'connectomics',
    'cryo-em',
    'ct-data',
    'deconvolution-microscopy',
    'dicom',
    'dicom-rt',
    'digital-pathology-data',
    'digital-pathology',
    'digital-slide-archive',
    'dmri',
    'electron-microscopy',
    'electrophysiology',
    'fluorescence',
    'fluorescence-microscopy-imaging',
    'fmri',
    'fmri-preprocessing',
    'functional-connectomes',
    'healthcare-imaging',
    'histology',
    'voxel',
    'microorganism-colonies',
    'microscopy',
    'microscopy-images',
    'medical',
    'medical-image-computing',
    'medical-image-processing',
    'medical-images',
    'medical-imaging',
    'mri',
    'myelin',
    'neural-engineering',
    'neuroanatomy',
    'neuroimaging',
    'neuroimaging-analysis',
    'neuropoly',
    'neuroscience',
    'nih-brain-initiative',
    'openslide',
    'pathology',
    'pathology-image',
    'radiation-oncology',
    'radiation-physics',
    'raman',
    'retinal-implants',
    'scanning-probe-microscopy',
    'scanning-tunnelling-microscopy',
    'single-cell-imaging',
    'slide-images',
    'spectroscopy',
    'spinalcord',
    'stm',
    'stem',
    'stitching',
    'structural-connectomes',
    'tissue-localization',
    'tomography',
    'volumetric-images',
    'whole-slide-image',
    'whole-slide-imaging',
]
# Omit the following repositories from the filtered list.
# These match at least one of the search terms above, but do not appear to be
# biology-focused. (e.g. the term "cell" appears in "Marcello").
omit_list = [
    'Marcello-Sega/pytim',
    'PMEAL/porespy',
]
# --------------------------------
# END OF USER-CONFIGURABLE OPTIONS
# --------------------------------
# Parse at most this many web pages.
# Parsing should automatically stop when reaching the last page.
max_page_num = 100
packages = True
url = ('https://github.com/{}/network/dependents'
       '?dependent_type=PACKAGE').format(repo_to_query)
package_list = []
ghost_list = []
prev_len = 0
for i in range(max_page_num):
    # retrieve HTML for the current URL
    print("GET " + url)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    page_package_list = []
    page_ghost_list = []
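
    # Each dependent on the page sits in a "Box-row" <div>. A rough sketch of
    # the markup the selectors below rely on (reconstructed from those
    # selectors; the exact 2020-era attributes and layout are an assumption):
    #
    #   <div class="Box-row">
    #     <a data-repository-hovercards-enabled="">org</a> /
    #     <a data-hovercard-type="repository">repo</a>
    #     <span class="text-gray-light">
    #       <svg class="octicon octicon-star"></svg> 1,234
    #     </span>
    #     <span class="text-gray-light">
    #       <svg class="octicon octicon-repo-forked"></svg> 56
    #     </span>
    #   </div>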
    for t in soup.find_all("div", {"class": "Box-row"}):
        try:
            # find repository org/name
            name = "{}/{}".format(
                t.find('a', {"data-repository-hovercards-enabled": ""}).text,
                t.find('a', {"data-hovercard-type": "repository"}).text
            )
        except AttributeError:
            # Ghost repositories will give None for the find() calls above.
            # This results in an AttributeError when trying to access .text.
            page_ghost_list.append(t.text)
            continue
        # extract the number of stars
        stars = 'unknown'
        for span in t.find_all('span', attrs={'class': 'text-gray-light'}):
            svg_star = span.find_all('svg', attrs={'class': 'octicon-star'})
            if svg_star:
                # replace "," in e.g. "1,000" before casting to int
                stars = int(span.text.strip().replace(",", ""))
                break

        # extract the number of forks
        forks = 'unknown'
        for span in t.find_all('span', attrs={'class': 'text-gray-light'}):
            svg_fork = span.find_all('svg',
                                     attrs={'class': 'octicon-repo-forked'})
            if svg_fork:
                # replace "," in e.g. "1,000" before casting to int
                forks = int(span.text.strip().replace(",", ""))
                break

        page_package_list.append((name, forks, stars))
    # append packages from the current page to the overall lists
    package_list = package_list + page_package_list
    ghost_list = ghost_list + page_ghost_list

    # remove any duplicates
    package_list = list(set(package_list))
    ghost_list = list(set(ghost_list))

    # terminate if no change from the prior URL
    new_len = len(package_list) + len(ghost_list)
    if new_len == prev_len:
        print("no change in package lists... stopping scraping")
        break
    prev_len = new_len
# find the URL for the "Next" page of packages
paginationContainers = soup.find(
"div", {"class": "paginate-container"}).find_all('a')
url = None
for paginationContainer in paginationContainers:
# Make sure we are retrieving the "Next" page and not the "Previous"
if paginationContainer.text == "Next":
url = paginationContainer["href"]
if url is None:
print("No additional next page found, ... stopping scraping")
break
# sort by descending number of stars
# This is the first list mentioned at the top.
# (Assumes the star count was parsed for every entry; a leftover 'unknown'
# string would make this sort and the comparison below raise a TypeError.)
all_packages = sorted(package_list, key=lambda x: x[2], reverse=True)

# Create the second list by retaining only packages with >= min_stars stars.
# Note that in the package list, each tuple is:
# (name, # of forks, # of stars)
_popular_packages = [p for p in all_packages if p[2] >= min_stars]
n_popular = len(_popular_packages)

# add a 4th term to each tuple, containing the GitHub topic list
popular_packages = []
for n, p in enumerate(_popular_packages):
    print("Retrieving topics for package {} of {}".format(n + 1, n_popular))
    repo_name = p[0]
    repo = g.get_repo(repo_name)
    topics = repo.get_topics()
    popular_packages.append(p + (topics,))
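
# At this point, each entry of popular_packages is a 4-tuple; an illustrative
# (made-up) example:
#     ('some-org/some-repo', 12, 345, ['image-processing', 'microscopy'])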
print("Applying filtering")
popular_filtered_packages = []
for p in popular_packages:
name = p[0]
name_lower = name.lower()
if name in omit_list:
continue
topics = p[3]
keep = False # unless we match a term below, we will exclude the package
# check match based on repository organization/name
for m in repo_name_terms:
if m in name_lower:
keep = True
break
# If not already a match, search based on topic search terms
if not keep:
for topic in topics:
if topic in topic_search_terms:
keep = True
break
if keep:
popular_filtered_packages.append(p)
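
# Illustration of the filter with made-up entries: 'some-org/neuroscan' would
# be kept via the 'neuro' name term, ('lab/viewer', 3, 80, ['microscopy'])
# would be kept via its topic, and ('lab/array-utils', 3, 80, ['hpc']) would
# be dropped.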
# base name shared by all output files
fname_base = repo_to_query.replace('/', '_')

# dump output lists to pickle
if save_to_pickle:
    print("Writing pickle files")
    # NOTE: hard-coded, user-specific output directory
    os.chdir('/media/lee8rx/data/Dropbox/Dropbox/Grants/CZI')
    with open(fname_base + '_all_packages.pickle', 'wb') as f:
        pickle.dump(all_packages, f)
    with open(fname_base + '_popular_packages.pickle', 'wb') as f:
        pickle.dump(popular_packages, f)
    with open(fname_base + '_popular_filtered_packages.pickle', 'wb') as f:
        pickle.dump(popular_filtered_packages, f)
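
    # For reference, a saved list can be reloaded later with pickle.load, e.g.:
    #     with open(fname_base + '_all_packages.pickle', 'rb') as f:
    #         all_packages = pickle.load(f)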
if save_to_csv:
    print("Writing CSV files")
    df_all = pandas.DataFrame(
        all_packages,
        columns=('name', '# of forks', '# of stars')
    )
    df_all = df_all.set_index('name')
    df_all.to_csv(fname_base + '_all_dependents.csv')

    df_popular = pandas.DataFrame(
        popular_packages,
        columns=('name', '# of forks', '# of stars', 'topics')
    )
    df_popular = df_popular.set_index('name')
    df_popular.to_csv(fname_base + '_popular_dependents.csv')

    df_filtered_popular = pandas.DataFrame(
        popular_filtered_packages,
        columns=('name', '# of forks', '# of stars', 'topics')
    )
    df_filtered_popular = df_filtered_popular.set_index('name')
    df_filtered_popular.to_csv(fname_base + '_filtered_dependents.csv')

    # print(df_filtered_popular.to_markdown())
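
# The CSVs can later be read back with pandas for further analysis (a minimal
# sketch; the 'topics' column is stored as the string representation of a
# Python list, so ast.literal_eval is one way to recover it):
#     import ast
#     df = pandas.read_csv(fname_base + '_filtered_dependents.csv',
#                          index_col='name')
#     df['topics'] = df['topics'].apply(ast.literal_eval)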