
Commit

Fix e2e dataset construction
pragya16067 authored Jan 18, 2023
1 parent a26194b commit 00b8bcf
Showing 5 changed files with 257 additions and 35 deletions.
67 changes: 60 additions & 7 deletions e2e_scripts/preprocess_s2and_data.py
@@ -13,6 +13,14 @@
import numpy as np
from utils.parser import Parser

from s2and.data import ANDData
import logging
from s2and.featurizer import FeaturizationInfo, featurize

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)


def save_blockwise_featurized_data(dataset_name, random_seed):
parent_dir = f"{DATA_HOME_DIR}/{dataset_name}"
@@ -23,22 +31,20 @@ def save_blockwise_featurized_data(dataset_name, random_seed):
specter_embeddings=join(parent_dir, f"{dataset_name}_specter.pickle"),
clusters=join(parent_dir, f"{dataset_name}_clusters.json"),
block_type="s2",
train_pairs_size=100,
val_pairs_size=100,
test_pairs_size=100,
train_pairs_size=100000,
val_pairs_size=10000,
test_pairs_size=10000,
name=dataset_name,
n_jobs=2,
n_jobs=16,
random_seed=random_seed,
)
# Preprocess the whole dataset (comment the following line out to skip this step)
AND_dataset.process_whole_dataset()

# Load the featurizer, which calculates pairwise similarity scores
featurization_info = FeaturizationInfo()
# the cache will make it faster to train multiple times - it stores the features on disk for you
train_pkl, val_pkl, test_pkl = store_featurized_pickles(AND_dataset,
featurization_info,
n_jobs=2,
n_jobs=16,
use_cache=False,
random_seed=random_seed)

@@ -53,6 +59,51 @@ def read_blockwise_features(pkl):
print("Total num of blocks:", len(blockwise_data.keys()))
return blockwise_data

def find_total_num_train_pairs(blockwise_data):
count = 0
for block_id in blockwise_data.keys():
count += len(blockwise_data[block_id][0])

print("Total num of signature pairs", count)

# def verify_diff_with_s2and(dataset_name, random_seed):
# parent_dir = f"{DATA_HOME_DIR}/{dataset_name}"
# AND_dataset = ANDData(
# signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
# papers=join(parent_dir, f"{dataset_name}_papers.json"),
# mode="train",
# specter_embeddings=join(parent_dir, f"{dataset_name}_specter.pickle"),
# clusters=join(parent_dir, f"{dataset_name}_clusters.json"),
# block_type="s2",
# train_pairs_size=100,
# val_pairs_size=100,
# test_pairs_size=100,
# # train_pairs_size=100000,
# # val_pairs_size=10000,
# # test_pairs_size=10000,
# name=dataset_name,
# n_jobs=2,
# random_seed=random_seed,
# )
#
# # Load the featurizer, which calculates pairwise similarity scores
# featurization_info = FeaturizationInfo()
# # the cache will make it faster to train multiple times - it stores the features on disk for you
# train, val, test = featurize(AND_dataset, featurization_info, n_jobs=2, use_cache=False)
# X_train, y_train, _ = train
# X_val, y_val, _ = val
#
# logger.info("Done loading and featurizing")
#
# #Verify the 2 sets are equal
# with open("s2and_data_subsample.pkl", "rb") as _pkl_file:
# s2and_set = pickle.load(_pkl_file)
#
# with open("our_data_subsample.pkl", "rb") as _pkl_file:
# our_set = pickle.load(_pkl_file)
#
# print("VERIFICATION STATUS: ", s2and_set==our_set)


if __name__=='__main__':
# Creates the pickles that store the preprocessed data
@@ -77,6 +128,8 @@ def read_blockwise_features(pkl):
val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/val_features.pkl"
test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset}/seed{seed}/test_features.pkl"
blockwise_features = read_blockwise_features(train_pkl)
find_total_num_train_pairs(blockwise_features)
#verify_diff_with_s2and(dataset, seed)
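For orientation, here is a minimal sketch (not part of the commit) of how one of the blockwise feature pickles written by this script can be inspected. The block_id -> (features, labels, cluster_ids) layout is inferred from read_blockwise_features and load_training_data below, and the pubmed/seed1 path mirrors the defaults used in train_s2and_hac.py.

import pickle
from s2and.consts import PREPROCESSED_DATA_DIR

train_pkl = f"{PREPROCESSED_DATA_DIR}/pubmed/seed1/train_features.pkl"
with open(train_pkl, "rb") as _pkl_file:
    blockwise_data = pickle.load(_pkl_file)  # block_id -> (features, labels, cluster_ids)

# Peek at one block: one feature row and one label per signature pair in the block
block_id, (X, y, cluster_ids) = next(iter(blockwise_data.items()))
print(block_id, X.shape, y.shape)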



105 changes: 105 additions & 0 deletions e2e_scripts/train_s2and_hac.py
@@ -0,0 +1,105 @@
import pickle
from os.path import join
from typing import Dict, Tuple
import numpy as np
from s2and.consts import PREPROCESSED_DATA_DIR
from s2and.data import ANDData
import logging
from s2and.model import PairwiseModeler
from s2and.featurizer import FeaturizationInfo, featurize
from s2and.eval import pairwise_eval, cluster_eval
from s2and.model import Clusterer, FastCluster
from hyperopt import hp

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)

def load_training_data(train_pkl, val_pkl):
blockwise_data: Dict[str, Tuple[np.ndarray, np.ndarray]]
with open(train_pkl, "rb") as _pkl_file:
blockwise_data = pickle.load(_pkl_file)
# Combine the per-block arrays into a single training set
# (the zero row is a placeholder so np.concatenate can start from a 2-D array;
# 39 is the width of the pairwise feature vectors)
remove_arr = np.zeros(39)
X_train = [remove_arr]
y_train = []
for block_data in blockwise_data.values():
x, y, cluster_ids = block_data
X_train = np.concatenate((X_train, x), axis=0)
y_train = np.concatenate((y_train, y), axis=0)
# drop the placeholder row; axis=0 keeps X_train two-dimensional
X_train = np.delete(X_train, 0, axis=0)

blockwise_data_val: Dict[str, Tuple[np.ndarray, np.ndarray]]
with open(val_pkl, "rb") as _pkl_file:
blockwise_data_val = pickle.load(_pkl_file)
# Combine the per-block arrays into a single validation set
X_val = [remove_arr]
y_val = []
for block_data in blockwise_data_val.values():
x, y, cluster_ids = block_data
X_val = np.concatenate((X_val, x), axis=0)
y_val = np.concatenate((y_val, y), axis=0)
# drop the placeholder row here as well
X_val = np.delete(X_val, 0, axis=0)

logger.info("Dataset loaded and prepared for training")

# dataset = ANDData(
# signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
# papers=join(parent_dir, f"{dataset_name}_papers.json"),
# mode="train",
# specter_embeddings=join(parent_dir, f"{dataset_name}_specter.pickle"),
# clusters=join(parent_dir, f"{dataset_name}_clusters.json"),
# block_type="s2",
# name=dataset_name,
# n_jobs=4,
# )
# logger.info("loaded the data")

# Build the featurization config, which defines the pairwise similarity features
featurization_info = FeaturizationInfo()

logger.info("Done loading and featurizing")
return featurization_info, X_train, y_train, X_val, y_val

def train_pairwise_classifier(featurization_info, X_train, y_train, X_val, y_val):
# calibration fits isotonic regression after the binary classifier is fit
# monotone constraints help the LightGBM classifier behave sensibly
pairwise_model = PairwiseModeler(
n_iter=25, monotone_constraints=featurization_info.lightgbm_monotone_constraints
)
# this does hyperparameter selection, which is why we need to pass in the validation set.
pairwise_model.fit(X_train, y_train, X_val, y_val)
logger.info("Fitted the Pairwise model")

# this will also dump a lot of useful plots (ROC, PR, SHAP) to the figs_path
pairwise_metrics = pairwise_eval(X_val, y_val, pairwise_model.classifier, figs_path='figs/', title='validation_metrics')
logger.info(pairwise_metrics)
return pairwise_model

def train_HAC_clusterer(dataset_name, featurization_info, pairwise_model):
clusterer = Clusterer(
featurization_info,
pairwise_model,
cluster_model=FastCluster(linkage="average"),
search_space={"eps": hp.uniform("eps", 0, 1)},
n_iter=25,
n_jobs=8,
)
clusterer.fit(dataset_name)

# the metrics_per_signature are there so we can break out the facets if needed
metrics, metrics_per_signature = cluster_eval(dataset_name, clusterer)
logger.info(metrics)


if __name__=='__main__':
dataset_name = "pubmed"
dataset_seed = 1
parent_dir = f"../data/{dataset_name}"
train_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset_name}/seed{dataset_seed}/train_features.pkl"
val_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset_name}/seed{dataset_seed}/val_features.pkl"
test_pkl = f"{PREPROCESSED_DATA_DIR}/{dataset_name}/seed{dataset_seed}/test_features.pkl"

featurization_info, X_train, y_train, X_val, y_val = load_training_data(train_pkl, val_pkl)
pairwise_model = train_pairwise_classifier(featurization_info, X_train, y_train, X_val, y_val)
train_HAC_clusterer(dataset_name, featurization_info, pairwise_model)
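One note on load_training_data above: the placeholder-row pattern (np.zeros(39) followed by np.delete) can be avoided by collecting the per-block arrays in plain Python lists and stacking once at the end. A minimal sketch of that variant, assuming the same block_id -> (features, labels, cluster_ids) pickle layout; it is an editorial alternative, not part of the commit.

import pickle
from typing import Tuple
import numpy as np

def load_split(pkl_path: str) -> Tuple[np.ndarray, np.ndarray]:
    # Merge per-block feature matrices and label vectors into one pair of arrays
    with open(pkl_path, "rb") as _pkl_file:
        blockwise_data = pickle.load(_pkl_file)
    X_parts, y_parts = [], []
    for x, y, _cluster_ids in blockwise_data.values():
        X_parts.append(x)
        y_parts.append(y)
    return np.vstack(X_parts), np.concatenate(y_parts)

Stacking once at the end keeps the cost linear in the number of pairs, instead of reallocating the growing arrays on every np.concatenate call inside the loop.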
