@@ -17,7 +17,7 @@ def extract_kaggle(file_path):
1717 kaggle_identifiers = [f"{ username } /{ dataset } " for username , dataset in matches ]
1818 return kaggle_identifiers
1919
20- dir = "_datasets"
20+ dir = "blastnet.github.io/ _datasets"
2121total_bytes = 0
2222total_size = 0
2323json_dump = {}
@@ -75,25 +75,33 @@ def format_bytes(num_bytes):
7575 num_bytes /= factor
7676 unit_index += 1
7777 return f"{ num_bytes :.3f} { units [unit_index ]} "
def unformat_bytes(string):
    """Convert a human-readable size such as ``"1.500 GB"`` into bytes.

    Parses text of the form ``"<number> <unit>"`` (the shape emitted by
    ``format_bytes``) using decimal, power-of-1000 units.
    NOTE(review): assumes ``format_bytes`` also scales by 1000 -- confirm,
    its factor is not visible next to this function.

    Args:
        string: Size text made of a number and a unit suffix separated by
            whitespace. The unit must be one of B, KB, MB, GB, TB, PB, EB,
            ZB or YB.

    Returns:
        The size in bytes as a ``float``.

    Raises:
        ValueError: If the text is not exactly a number followed by a unit,
            the number cannot be parsed, or the unit is not recognised.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
    factor = 1000

    # split() with no argument tolerates leading/trailing whitespace and
    # repeated separators, which split(" ") turned into an unpacking error.
    parts = string.split()
    if len(parts) != 2:
        raise ValueError(
            f"expected '<number> <unit>', got {string!r}"
        )
    num, unit = parts

    # Report unknown units explicitly instead of list.index's opaque
    # "x is not in list" message; the exception type stays ValueError.
    if unit not in units:
        raise ValueError(f"unknown size unit {unit!r} in {string!r}")

    return float(num) * (factor ** units.index(unit))
7883
7984 # SPECIFIC DATASET STATISTICS TO OUTPUT
8085 # Take the maximum of views/downloads from each of the sub-datasets
8186 # More representative than summing, since the same user would likely view multiple sub-datasets
82- ds_size = format_bytes (np .sum (size_in_bytes ))
87+ ds_size_raw = np .sum (size_in_bytes )
88+ ds_size = format_bytes (ds_size_raw )
8389 ds_views = np .max (views ) #np.sum(views)
8490 ds_downs = np .max (downloads ) #np.sum(downloads)
8591 print (f'{ filename } ({ ds_size } ) processed. { ds_views } views, { ds_downs } downloads.' )
8692
87- # Use old data as fallback
88- if not ds_size :
93+ if not ds_size_raw :
94+ # Use old data as fallback
8995 kaggle_stats = json .loads (gist .files ['kaggle_stats.json' ].content )
9096 kaggle_stats = kaggle_stats [filename ]
91-
92- # Save as dictionary and throw it to the preamble
93- kaggle_stats = {
94- 'size' : ds_size ,
95- 'views' : ds_views ,
96- 'downloads' : ds_downs ,
97+ size_in_bytes = unformat_bytes (kaggle_stats ['size' ])
98+ downloads = kaggle_stats ['downloads' ]
99+ else :
100+ # Save as dictionary and throw it to the preamble
101+ kaggle_stats = {
102+ 'size' : ds_size ,
103+ 'views' : ds_views ,
104+ 'downloads' : ds_downs ,
97105 }
98106 json_dump [filename ] = kaggle_stats
99107 total_bytes += int (np .sum (downloads * size_in_bytes ))
@@ -102,6 +110,9 @@ def format_bytes(num_bytes):
102110if not total_bytes :
103111 raise Exception ("Zero data encountered, exiting" )
104112 exit ()
113+ #old_data = json.loads(gist.files['kaggle_stats.json'].content)
114+ #total_bytes = old_data['total_bytes']
115+ #total_size = old_data['total_size']
105116
106117json_dump ['total_bytes' ] = total_bytes
107118json_dump ['total_size' ] = total_size
0 commit comments