settings.py
import os
import pathlib
import time
import argparse
import re

from utils import get_model_type

project_root = str(pathlib.Path(__file__).parent.resolve())

def str2bool(val):
    if isinstance(val, bool):
        return val
    if val.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif val.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Expected boolean value.')
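
# Usage note (illustrative): str2bool is registered below as `type=str2bool`
# together with `nargs='?', const=True`, so passing `--flag` with no value
# parses as True, `--flag no` parses as False, and omitting the flag keeps
# the argument's default.
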
def sorted_alphanumeric(data):
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    return sorted(data, key=alphanum_key)
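
# Example (hypothetical file names): sorted_alphanumeric(['ckpt10', 'ckpt2',
# 'ckpt1']) returns ['ckpt1', 'ckpt2', 'ckpt10'] (numeric-aware,
# case-insensitive ordering), whereas plain sorted() would return
# ['ckpt1', 'ckpt10', 'ckpt2'].
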
def parse_arguments():
    argparser = argparse.ArgumentParser()
    # Main parameters
    argparser.add_argument("--mode", type=str, default='pretrain',
                           help='Mode of experiment (pretrain, rewrite, downstream).')
    argparser.add_argument("--dataset", type=str, required=True,
                           help='mode=pretrain: Which dataset will be used for training the autoencoder. '
                                'mode=rewrite: Which dataset will be rewritten. '
                                'mode=downstream: Which dataset the downstream task will be trained on.')
    # 'Can include multiple datasets from the huggingface datasets library by specifying: '
    # 'huggingface_multiple_dataset1_dataset2')
    argparser.add_argument("--model", type=str, default='adept',
                           help="Model to run; must match the specified mode (see the list of models in the README). "
                                "(Currently only 'adept' for pre-training/rewriting and 'bert_downstream' for "
                                "downstream classification.)")
    # Optional, general params: meta params for running experiments
    argparser.add_argument("--name", type=str, default=None,
                           help='The experiment name. Defaults to the current timestamp.')
    argparser.add_argument("--experiment", type=str, default=None,
                           help='The type of experiment to run: choose from the existing set available in the '
                                'project, give a custom name for a custom experiment, or leave as None, in which '
                                'case the default experiment pipeline will be run.')  # maybe define 'default' in more detail
    argparser.add_argument("--seed", type=int, default=12345)
    argparser.add_argument("--test_only", type=str2bool, nargs='?',
                           const=True, default=False,
                           help='Only test on a saved downstream model '
                                'according to the specified configuration '
                                '(only applies to mode=downstream). '
                                'If set to true, the experiment name and results directory need to have a '
                                'checkpoint available to load from.')
    argparser.add_argument('--local', type=str2bool, nargs='?', const=True,
                           default=False,
                           help='Run the experiment in a limited setting for local debugging '
                                '(with CPU and limited iteration size, defined with the '
                                '"local_iter_size" argument).')
    argparser.add_argument('--local_iter_size', type=int, default=10,
                           help='Determines the number of iterations per epoch '
                                'when running experiments locally.')
    # Optional, general params: directories
    argparser.add_argument("--output_dir", type=str, default=os.path.join(project_root, 'results'),
                           help='Where stats & logs will be saved (a subfolder will be created for '
                                'the experiment). Defaults to <project_root>/results.')
    argparser.add_argument("--dump_dir", type=str, default=None,
                           help='Where storage-heavy artifacts (e.g., model checkpoints & rewritten data) '
                                'will be saved. Defaults to output_dir.')
    argparser.add_argument("--asset_dir", type=str, default=os.path.join(project_root, 'assets'),
                           help='Where to look for assets like data sets & embeddings. '
                                'Defaults to <project_root>/assets. '
                                'For data sets, this can be overridden with the data_dir argument.')
    argparser.add_argument("--data_dir", type=str, default=None,
                           help='Where to look for data sets. Defaults to asset_dir.')
    argparser.add_argument("--embed_dir_unprocessed", type=str, default=None,
                           help='Where to look for pre-trained embedding models (e.g., Word2Vec, GloVe).')
    argparser.add_argument("--custom_train_path", type=str, default=None,
                           help='Where to look for a custom dataset (train partition), if not using one of the '
                                'prepared datasets of the framework.')
    argparser.add_argument("--custom_valid_path", type=str, default=None,
                           help='Where to look for a custom dataset (optional validation partition), if not using '
                                'one of the prepared datasets of the framework. If not supplied for pre-training, '
                                'a validation set will be created from a random split of the train partition.')
    argparser.add_argument("--custom_test_path", type=str, default=None,
                           help='Where to look for a custom dataset (optional test partition), if not using one of '
                                'the prepared datasets of the framework.')
    argparser.add_argument("--downstream_test_data", type=str, default=None,
                           help='Determines a test set to load from one of the datasets of the framework '
                                '(downstream mode only, when loading a training/validation dataset from custom paths).')
    # Optional, general params: datasets
    argparser.add_argument("--prepend_labels", type=str2bool, nargs='?',
                           const=True, default=False,
                           help='Whether to prepend labels to the beginning '
                                'of each sequence.')
    argparser.add_argument("--train_ratio", type=float, default=0.8,
                           help='Training dataset size ratio for the train/validation split (pretrain/downstream '
                                'modes only). Not used if a custom dataset is specified with a path for the '
                                'validation set.')
    argparser.add_argument("--length_threshold", type=int, default=None,
                           help='An optional value by which to subsample a dataset to only include documents that '
                                'are originally below a certain number of tokens (based on a whitespace split of '
                                'each document).')
    argparser.add_argument("--custom_preprocessor", type=str2bool, nargs='?',
                           const=True, default=False,
                           help='Whether to use a custom preprocessor instead of one of the built-in ones.')
    argparser.add_argument("--last_checkpoint_path", type=str, default='',
                           help='Global path of the checkpoint that should be used. '
                                'Pretrain mode will use this to resume training. '
                                'Rewrite mode will load the model used for rewriting from that checkpoint.')
    argparser.add_argument("--include_original", type=str2bool, nargs='?', const=True, default=False,
                           help='Whether or not to include the original dataset in the rewritten dataframe '
                                '(rewrite mode only).')
    # Optional, general params: hyperparams
    argparser.add_argument("--epochs", type=int, default=5)
    argparser.add_argument("--batch_size", type=int, default=1)
    argparser.add_argument("--learning_rate", type=float, default=0.001)
    argparser.add_argument("--weight_decay", type=float, default=0.00)
    argparser.add_argument("--early_stopping", type=str2bool, nargs='?',
                           const=True, default=True)
    argparser.add_argument("--patience", type=int, default=20)
    argparser.add_argument("--optim_type", type=str, default='adam',
                           help='sgd or adam.')
    argparser.add_argument("--two_optimizers", type=str2bool, nargs='?',
                           const=True, default=True,
                           help='Requires the specified model to have distinct "encoder" and "decoder" components.')
    argparser.add_argument("--hidden_size", type=int, default=768)
    argparser.add_argument("--enc_out_size", type=int, default=128,
                           help='Specific to RNN models.')
    argparser.add_argument("--vocab_size", type=int, default=20000,
                           help='Specific to RNN models; specifies the maximum vocabulary size based on frequency '
                                'when using a pre-trained word embedding model.')
    argparser.add_argument("--max_seq_len", type=int, default=512)
    argparser.add_argument("--embed_size", type=int, default=300)
    argparser.add_argument("--embed_type", type=str, default='glove',
                           help='"glove" or "word2vec".')
    argparser.add_argument("--transformer_type", type=str, default='bert-base-uncased')
    argparser.add_argument("--train_teacher_forcing_ratio", type=float,
                           default=0.0, help='For RNN-based models.')
    argparser.add_argument("--private", type=str2bool, nargs='?',
                           const=True, default=False,
                           help='Whether privatization should be applied during pre-training.')
    argparser.add_argument("--epsilon", type=float, default=1)
    argparser.add_argument("--delta", type=float, default=1e-5)
    argparser.add_argument("--clipping_constant", type=float, default=1.)
    argparser.add_argument("--save_initial_model", type=str2bool, nargs='?', const=True, default=False,
                           help='Whether to save a checkpoint of the model before starting the training procedure '
                                '(pre-training mode only). Useful for convenient comparison of rewriting '
                                'capabilities of models that were not locally pre-trained. The model will not be '
                                'saved if loading from an existing checkpoint.')
    argparser.add_argument("--dp_mechanism", type=str, default='laplace',
                           help='laplace or gaussian. '
                                'Has no effect as of now.')
    argparser.add_argument("--dp_module", type=str, default='clip_norm',
                           help='The type of DP module to be applied, specific to certain autoencoder models. '
                                'Relevant arguments must be specified for each specific module.')
    argparser.add_argument("--l_norm", type=int, default=2,
                           help='Pass 2 for the L2 norm, 1 for the L1 norm. '
                                'The original ADePT uses the L2 norm (the default value of this param), '
                                'which is one of the reasons why ADePT is not DP, cf. '
                                'http://dx.doi.org/10.18653/v1/2021.emnlp-main.114. '
                                'Used in the "clip_norm" dp_module.')
    argparser.add_argument("--no_clipping", type=str2bool, nargs='?', const=True, default=False,
                           help='Whether or not to clip encoder hidden states in the non-private setting.')
    argparser.add_argument("--custom_model_arguments", nargs='*',
                           help='Additional optional arguments for a custom model; no upper limit on the number.')
    args = argparser.parse_args()
    return args
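
# Example invocation (illustrative only; 'main.py' and the dataset name are
# assumptions, not taken from this file):
#   python main.py --mode pretrain --dataset <dataset> --model adept \
#       --private true --epsilon 1 --dp_module clip_norm
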
class Settings(object):
    '''
    Configuration for the project.
    '''

    def __init__(self, args):
        # args, e.g. the output of settings.parse_arguments()
        self.args = args
        now = time.localtime()
        self.args.formatted_now = f'{now[0]}-{now[1]}-{now[2]}--{now[3]:02d}-{now[4]:02d}-{now[5]:02d}'
        # Determining model type
        self.args.model_type = get_model_type(self.args.model)
        if self.args.name is None or self.args.name == '':
            self.args.name = self.args.formatted_now
        if self.args.dump_dir is None:
            self.args.dump_dir = self.args.output_dir
        if self.args.data_dir is None:
            self.args.data_dir = self.args.asset_dir
        self.exp_output_dir = os.path.join(self.args.output_dir, self.args.name)
        self.exp_dump_dir = os.path.join(self.args.dump_dir, self.args.name)
        self.checkpoint_dir = os.path.join(self.exp_dump_dir, 'checkpoints')
        self.embed_dir_processed = None
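
    # Resulting directory layout (illustrative; assumes --name my_exp and no
    # --dump_dir or --output_dir overrides):
    #   <project_root>/results/my_exp/             -> exp_output_dir & exp_dump_dir
    #   <project_root>/results/my_exp/checkpoints/ -> checkpoint_dir
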
    def make_dirs(self):
        for d in [
            self.args.output_dir,
            self.args.dump_dir,
            self.args.asset_dir,
            self.args.data_dir,
            self.exp_output_dir,
            self.exp_dump_dir,
            self.checkpoint_dir,
        ]:
            if not os.path.exists(d):
                os.makedirs(d)
        if self.args.model_type == 'rnn':
            self.embed_dir_processed = os.path.join(self.args.asset_dir,
                                                    'embeds')
            if not os.path.exists(self.embed_dir_processed):
                os.makedirs(self.embed_dir_processed)
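
# Minimal usage sketch (assumes this module is driven by a separate entry
# point; the sequence below is illustrative, not the project's actual driver):
#
#   args = parse_arguments()
#   settings = Settings(args)
#   settings.make_dirs()
#   # settings.exp_output_dir and settings.checkpoint_dir now exist on disk.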