-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresave_idx.py
29 lines (26 loc) · 1.11 KB
/
resave_idx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""
Check that the indices saved in old shard files (assigned by enumerate()) are consistent with the dataframe index.
"""
import pandas as pd
from pathlib import Path
import pickle
# Load the full function table that the shard files were originally built from.
df = pd.read_json('../4OH4/ReVeal/out/data/chrome_debian_cfg_full_text_files.json')

# Split the (label, row) pairs into parallel tuples: idx[k] is the dataframe
# label of the k-th row and functions[k] is that row. An empty frame would
# make the bare unpacking raise ValueError, so fall back to empty tuples.
rows = list(df.iterrows())
idx, functions = zip(*rows) if rows else ((), ())
print('idx is the dataframe index', df.head().index.values, idx[:len(df.head())])

# example indices used to be defined by the indices in enumerate(func_it),
# which may not match the dataframe index.
# We're lucky that we load a dataframe from json, which creates an autoincrementing index starting from 0.
func_it = enumerate(functions)
for i, row in func_it:
    # Row at enumerate position i vs. row at the position given by its own
    # label: these agree only when labels coincide with positions.
    assert row["file_name"] == df.iloc[idx[i]]["file_name"], (
        f'position {i} and label {idx[i]} refer to different files'
    )

# Walk new_functions.pkl.shard0, shard1, ... until the first missing file.
shard_idx = 0
shard_filename = Path(f'new_functions.pkl.shard{shard_idx}')
while shard_filename.exists():
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # run this against shard files you produced yourself.
    with open(shard_filename, 'rb') as shard_file:
        shard = pickle.load(shard_file)
    for r in shard:
        # The first element of each shard record is the index it was saved under.
        i = r[0]
        assert i in df.index, f'{shard_filename}: index {i} not in dataframe index'
        # Compare the row at enumerate position i with the row *labelled* i.
        # Using .iloc here would compare a row with itself and never fail.
        assert functions[i]["file_name"] == df.loc[i]["file_name"], (
            f'{shard_filename}: saved index {i} does not match dataframe label {i}'
        )
    shard_idx += 1
    shard_filename = Path(f'new_functions.pkl.shard{shard_idx}')