 
GITHUB_TOKEN = os.getenv("GIST_TOKEN")
GIST_ID = "c9112c25c5acd400b90741efa81aa411"
+USE_TRUE_SIZE = True

g = Github(GITHUB_TOKEN)
gist = g.get_gist(GIST_ID)

# Format a byte count into a human-readable '<number> <unit>' string
def format_bytes(num_bytes):
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
-    factor = 1000
+    factor = 1024
    unit_index = 0
    while num_bytes >= factor and unit_index < len(units) - 1:
        num_bytes /= factor
@@ -21,7 +22,7 @@ def format_bytes(num_bytes):
def unformat_bytes(string):
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
    num, unit = string.split(" ")
-    factor = 1000
+    factor = 1024
    return float(num) * (factor ** (units.index(unit)))

dir = "_datasets"
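A quick round-trip sanity check of the two helpers (an illustrative sketch, not part of the commit; it assumes format_bytes returns strings shaped like '<number> <unit>', which is exactly what unformat_bytes parses):

    >>> format_bytes(1536)         # 1536 / 1024 -> 1.5, one unit step
    '1.5 KB'
    >>> unformat_bytes('1.5 KB')   # 1.5 * 1024**1
    1536.0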
@@ -70,7 +71,24 @@ def unformat_bytes(string):
 
        # Record all the identifiers to the filename
        search_map[filename].append(f'{username}/{dataset}')
-
+
+# Remove the sharmapushan/pimapnet Kaggle model from the dataset maps
+search_map["sharma2024.md"] = [x for x in search_map["sharma2024.md"] if x != 'sharmapushan/pimapnet']
+usernames["sharmapushan"] = [x for x in usernames["sharmapushan"] if x.ref != 'sharmapushan/pimapnet']
+
+# Read the true dataset sizes from the gist's JSON file, falling back to a local copy
+try:
+    datasets_size = json.loads(gist.files['datasets_size.json'].content)
+    print(f"Read {len(datasets_size)} dataset sizes from gist")
+except Exception as e:
+    print(f'Could not read datasets_size.json from gist: {e}')
+    if os.path.exists('datasets_size.json'):
+        with open('datasets_size.json', 'r') as f:
+            datasets_size = json.load(f)
+        print('Loaded datasets_size.json from local file.')
+    else:
+        raise FileNotFoundError("No datasets_size.json found")
+
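For reference, this code expects datasets_size.json to map each dataset slug (the part of the ref after the username) to its size in bytes. A minimal sketch of the assumed shape, with made-up slugs and sizes:

    {
        "example-dataset": 52428800,
        "another-dataset": 1073741824
    }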
# At this point we have done all the necessary scraping from Kaggle API calls
for filename in search_map:
    dataset_names = search_map[filename]
@@ -81,53 +99,55 @@ def unformat_bytes(string):
 
    for dsn in dataset_names:
        print(f'Processing {dsn}...')
-        # Old Kaggle Api < 1.7
+        # New Kaggle API >= 1.7
        try:
-            user = dsn.split("/")[0]
-            dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
-            downloads.append(int(dataset['downloadCount']))
-            views.append(int(dataset['viewCount']))
-            sizes.append(int(dataset['totalBytes']))
+            user, dataset_id = dsn.split("/")
+            dataset = next((d for d in usernames[user] if d.ref == dsn))
+            if USE_TRUE_SIZE:
+                # Use the true size from the json file; resolve it before the
+                # appends so a missing entry cannot leave the lists out of sync
+                if dataset_id not in datasets_size:
+                    raise Exception(f"Dataset {dataset_id} not found in datasets_size.json")
+                size = int(datasets_size[dataset_id])
+            else:
+                size = int(dataset.total_bytes)
+            downloads.append(int(dataset.download_count))
+            views.append(int(dataset.view_count))
+            sizes.append(size)
            print(f'{dsn} done.')
-
-        # New Kaggle Api >= 1.7
        except KeyError:
+            # Old Kaggle API < 1.7
            try:
                user = dsn.split("/")[0]
-                dataset = next((d for d in usernames[user] if d.ref == dsn))
-                downloads.append(int(dataset.download_count))
-                views.append(int(dataset.view_count))
-                sizes.append(int(dataset.total_bytes))
+                dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
+                downloads.append(int(dataset['downloadCount']))
+                views.append(int(dataset['viewCount']))
+                sizes.append(int(dataset['totalBytes']))
                print(f'{dsn} done.')
-
            except Exception:
                traceback.print_exc()
-                print(f'Error when reading {dsn}')
-                print(f'Continuing with 0 values...')
+                print(f'Error when reading {dsn}, continuing with 0 values...')
                downloads.append(0)
                views.append(0)
                sizes.append(0)
 
        except Exception:
            traceback.print_exc()
-            print(f'Error when reading {dsn}')
-            print(f'Continuing with 0 values...')
+            print(f'Error when reading {dsn}, continuing with 0 values...')
            downloads.append(0)
            views.append(0)
            sizes.append(0)
 
-
    views = np.array(views)
    downloads = np.array(downloads)
    size_in_bytes = np.array(sizes)

    # SPECIFIC DATASET STATISTICS TO OUTPUT
-    # Take the maximum of views/downloads from each of the sub-datasets
+    # Take the maximum of views from each of the sub-datasets
    # More representative than summing, since the same user would likely view multiple sub-datasets
    ds_size_raw = np.sum(size_in_bytes)
    ds_size = format_bytes(ds_size_raw)
    ds_views = np.max(views)  # np.sum(views)
-    ds_downs = np.max(downloads)  # np.sum(downloads)
+    ds_downs = np.sum(downloads)  # np.max(downloads)
    print(f'{filename} ({ds_size}) processed. {ds_views} views, {ds_downs} downloads.')

    if not ds_size_raw:
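To make the aggregation concrete (illustrative numbers only): for a dataset split into three sub-datasets with views [120, 95, 110] and downloads [10, 4, 7], the reported figures are

    >>> np.max(np.array([120, 95, 110]))   # ds_views: max across sub-datasets
    120
    >>> np.sum(np.array([10, 4, 7]))       # ds_downs: sum across sub-datasets
    21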
@@ -144,6 +164,7 @@ def unformat_bytes(string):
        'downloads': ds_downs,
    }
    json_dump[filename] = kaggle_stats
    total_bytes += int(np.sum(downloads * size_in_bytes))
    total_size += int(np.sum(size_in_bytes))

@@ -153,7 +174,9 @@ def unformat_bytes(string):
 
json_dump['total_bytes'] = total_bytes
json_dump['total_size'] = total_size
-
+print(f'Total size: {format_bytes(total_size)}')
+print(f'Total downloaded bytes (TB): {total_bytes / 1024 ** 4}')
+
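For scale (illustrative arithmetic only): with the script's 1024-based units, one TB is 1024**4 = 1,099,511,627,776 bytes, so the final print divides the raw byte counter down to TB, e.g.

    >>> 5_497_558_138_880 / 1024 ** 4
    5.0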
# Update the gist
# Need the custom encoder class to convert numpy numbers to JSON-serializable ones
class NpEncoder(json.JSONEncoder):
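The body of NpEncoder lies outside this hunk; a typical implementation of such an encoder (a sketch under that assumption, not necessarily the commit's exact code) converts numpy scalars and arrays to native Python types:

    class NpEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return super().default(obj)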
@@ -169,6 +192,7 @@ def default(self, obj):
print(f'Updating {gist}...')
try:
    gist.edit(files={'kaggle_stats.json': github.InputFileContent(content=json.dumps(json_dump, indent=4, cls=NpEncoder))})
+    gist.edit(files={'datasets_size.json': github.InputFileContent(content=json.dumps(datasets_size, indent=4, cls=NpEncoder))})
except Exception as e:
    print(f'Could not update {gist}: {e}')
    print('Dumping to file...')