forked from meryemmhamdi1/x-continuous-learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser_args.py
329 lines (244 loc) · 20 KB
/
parser_args.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
def add_path_arguments(parser):
    """Register filesystem/location options (all strings) on *parser*."""
    group = parser.add_argument_group("Path Parameters")
    # (flag, help text, default) — every option here is a plain string.
    string_options = [
        ("--data_root", "Root directory of the dataset.", ""),
        ("--model_root", "Path to the root directory hosting the trans model, if offline.", ""),
        ("--out_dir", "The root directory of the results for this project.", ""),
        ("--stats_file", "Filename of the stats file.", "stats.txt"),  # TODO CHECK WHAT THIS DOES EXACTLY
        ("--log_file", "Filename of the log file.", "log.txt"),  # TODO DO PROPER CHECKPOINTING
        ("--param_tune_idx", "Index of the tuning hyperparameters.", "0"),
    ]
    for flag, help_text, default_value in string_options:
        group.add_argument(flag, help=help_text, type=str, default=default_value)
def add_setup_arguments(parser):
    """Register the continual-learning scenario options on *parser*."""
    group = parser.add_argument_group("Setup Scenarios Parameters")

    # Long enumeration of the supported training scenarios, kept verbatim.
    setup_opt_help = ("The different setup scenarios to pick from:"
                      "* cil: Cross-CIL with fixed LL. "
                      "* cil-other: Incremental version of cil where previous intents' "
                      " subtasks are added in addition to other labels for"
                      " subsequent intents'subtasks."
                      "* cll: Cross-LL with fixed CIL."
                      "* cil-ll: Cross CIL and CLL mixed."
                      "* multi-incr-cil: Weaker version of Multi-Task Learning, where we"
                      " gradually fine-tune on the accumulation of "
                      " different subtasks."
                      "* multi-incr-cll: Weaker version of Multilingual Learning, where we"
                      " gradually fine-tune on the accumulation of "
                      " different languages."
                      "* multi: Multi-tasking one model on all tasks and languages.")
    group.add_argument("--setup_opt",
                       help=setup_opt_help,
                       type=str,
                       default="cll",
                       choices=["cil", "cil-other", "multi-incr-cil",
                                "cll", "multi-incr-cll", "cil-ll", "multi", "multi-equal",
                                "cll-er_kd", "cll-equal", "cll-equal-er_kd", "cll-n-ways",
                                "cll-k-shots"])
    group.add_argument("--cil_stream_lang",
                       default="en",
                       help="Which lang to work on for the CIL setup if it is picked.")
    group.add_argument("--order_class",
                       type=int,
                       default=0,
                       help="Different ways of ordering the classes:"
                            "* 0: high2lowclass: decreasing order (from high to low-resource)."
                            "* 1: low2highclass: increasing order (from low to high-resource)."
                            "* 2: randomclass: random order.")
    group.add_argument("--order_lang",
                       type=int,
                       default=0,
                       help="Different ways of ordering the languages:"
                            "* 0: high2lowlang: decreasing order (from high to low-resource)."
                            "* 1: low2highlang: increasing order (from low to high-resource)."
                            "* 2: randomlang: random order.")
    group.add_argument("--order_lst",
                       type=str,
                       default="",
                       help="Specific order for subtasks and languages: list of languages "
                            "or subtasks.")
    group.add_argument("--setup_cillia",
                       type=str,
                       default="intents",
                       help="Different ways of ordering mixture of both cll and cil:"
                            "* intents: traversing subtasks horizontally over all intent "
                            " classes first then to languages."
                            "* langs: traversing subtasks vertically over all languages first"
                            " then to classes.")
    group.add_argument("--random_pred",
                       action="store_true",
                       help="Whether to predict directly the random initialization of the model"
                            "when tested directly on the languages without any fine-tuning.")
def add_dataset_arguments(parser):
    """Register dataset selection and format options on *parser*."""
    group = parser.add_argument_group("Dataset Options")
    group.add_argument("--data_format", type=str, default="txt",
                       help="Whether it is tsv (MTOD), json, or txt (MTOP).")
    group.add_argument("--data_name", type=str, default="mtop", choices=["atis", "mtop"],
                       help="Whether it is mtop or atis.")
    group.add_argument("--languages", nargs="+",
                       default=["de", "en", "es", "fr", "hi", "th"],
                       help="Train languages list.")
    group.add_argument("--num_intent_tasks", type=int, default=10,
                       help="The number of intent per task.")
    group.add_argument("--num_lang_tasks", type=int, default=2,
                       help="The number of lang per task.")
def add_base_model_arguments(parser):
    """Register the base (Transformer + optimizer + schedule) hyperparameters on *parser*."""
    base_model_params = parser.add_argument_group("Base Model Parameters")
    base_model_params.add_argument("--trans_model", help="Name of the Transformer encoder model.",
                                   type=str, default="BertBaseMultilingualCased",
                                   choices=["BertBaseMultilingualCased", "BertLarge", "BertBaseCased",
                                            "Xlnet_base", "Xlnet_large", "XLM", "DistilBert_base",
                                            "DistilBert_large", "Roberta_base", "Roberta_large",
                                            "XLMRoberta_base", "XLMRoberta_large", "ALBERT-base-v1",
                                            "ALBERT-large-v1", "ALBERT-xlarge-v1", "ALBERT-xxlarge-v1",
                                            "ALBERT-base-v2", "ALBERT-large-v2", "ALBERT-xlarge-v2",
                                            "ALBERT-xxlarge-v2"])
    base_model_params.add_argument("--use_slots", help="If true, optimize for slot filling loss too.",
                                   action="store_true")
    base_model_params.add_argument("--use_mono", help="Whether to train monolingually.",
                                   action="store_true")
    base_model_params.add_argument("--epochs", help="The total number of epochs.",
                                   type=int, default=10)
    # NOTE(review): help says "epochs" but the name suggests a step interval — confirm usage.
    base_model_params.add_argument("--dev_steps", help="The total number of epochs to evaluate the model on the dev.",
                                   type=int, default=200)  # TODO DEV IS EVALUATED ON ONLY AFTER EACH EPOCH
    base_model_params.add_argument("--test_steps", help="The total number of epochs to evaluate the model on the test.",
                                   type=int, default=200)  # TODO THIS IS NOT USED CONSISTENTLY
    # Fixed copy-pasted help text: this is the batch size, not an epoch count.
    base_model_params.add_argument("--batch_size", help="The number of examples per training batch.",
                                   type=int, default=32)
    base_model_params.add_argument("--adam_lr", help="The learning rate for Adam Optimizer.",
                                   type=float, default=1e-03)
    base_model_params.add_argument("--adam_eps", help="Epsilon for the Adam Optimizer.",
                                   type=float, default=1e-08)
    base_model_params.add_argument("--beta_1", help="Beta_1 for the Adam Optimizer.",
                                   type=float, default=0.9)
    base_model_params.add_argument("--beta_2", help="Beta_2 for the Adam Optimizer.",
                                   type=float, default=0.99)
    base_model_params.add_argument("--step_size", help="The step size for the scheduler.",
                                   type=int, default=7)
    base_model_params.add_argument("--gamma", help="Gamma for the scheduler.",
                                   type=float, default=0.1)
    base_model_params.add_argument("--seed", help="Random Seed.",
                                   type=int, default=42)
def add_model_expansion_arguments(parser):
    """Register model-expansion options (multi-head input/output, adapters) on *parser*."""
    group = parser.add_argument_group("Model Expansion Options")
    group.add_argument("--multi_head_in",
                       action="store_true",
                       help="Whether to use multiple heads "
                            "that would imply multiple subtask/language-specific "
                            "heads at the input level.")
    # Valid values were once an explicit choices list ("embeddings",
    # "encoder.layer.0." .. "encoder.layer.11.", "pooler", "all"); it is
    # currently unrestricted free text.
    group.add_argument("--emb_enc_subtask_spec",
                       type=str,
                       default="all",
                       help="Which layer in the embeddings or the encoder "
                            "to tune for each subtask/language"
                            " independently.")
    group.add_argument("--multi_head_out",
                       action="store_true",
                       help="Whether to use multiple heads in the outputs that "
                            "would imply the use of different task-specific "
                            "layers.")
    group.add_argument("--use_adapters",
                       action="store_true",
                       help="whether to use adapters.")
    group.add_argument("--use_pretrained_adapters",
                       action="store_true",
                       help="Whether to use pre-trained adapters")
    group.add_argument("--adapter_type",
                       type=str,
                       default="MADX",
                       choices=["Houlsby", "MADX"],
                       help="Which adapter to use.")
    group.add_argument("--adapter_layers",
                       type=str,
                       default="0_1_2_3_4_5",
                       help="List of layers to which adapters are applied.")
def add_freezing_arguments(parser):
    """Register layer-freezing flags on *parser* (all default to off)."""
    group = parser.add_argument_group("Freezing Options")
    flags = [
        ("--freeze_trans", "Whether to freeze all layers in Transformer encoder/embed."),
        ("--freeze_first", "Whether to freeze from the first subtask/language."),
        ("--freeze_linear", "Whether to freeze all task-specific layers."),
    ]
    for flag, help_text in flags:
        group.add_argument(flag, help=help_text, action="store_true")
def cont_learn_arguments(parser):
    """Register continual-learning algorithm options on *parser*.

    Covers the algorithm choice plus the knobs for regularization (EWC),
    memory replay (ER/MbPA), and gradient-episodic-memory (GEM) variants.
    """
    cont_learn_params = parser.add_argument_group("Continuous Learning Options")
    cont_learn_params.add_argument("--cont_learn_alg", help="vanilla fine-tuning or some continuous learning algorithm:"
                                                           "(ewc, gem, mbpa, metambpa, etc) or vanilla if no specific"
                                                           "continuous learning algorithm is used.",
                                   choices=["vanilla", "ewc", "gem", "er", "mbpa", "kd-logits", "kd-rep", "reptile-er"],
                                   # TODO to be covered next "mbpa", "metambpa", "icarl", "xdg", "si", "lwf", "gr", "rtf", "er"
                                   type=str, default="vanilla")
    # NOTE(review): the default is a single space-joined string, not three list
    # items ["trans", "intent", "slot"] — confirm downstream consumers split it.
    cont_learn_params.add_argument("--cont_comp", help="Which component(s) in the model to focus on while learning "
                                                      "during regularization or replay",
                                   nargs="+", default=["trans intent slot"])
    ### for optimization (ewc, ewc online)
    cont_learn_params.add_argument("--old_task_prop", help="The percentage of old tasks used in regularization "
                                                          "or replay.",
                                   type=float, default=0.1)
    cont_learn_params.add_argument("--ewc_lambda", help="If ewc: lambda for regularization in ewc.",
                                   type=int, default=20)
    cont_learn_params.add_argument("--use_online", help="If ewc: Whether to use the online version of EWC or not.",
                                   action='store_true')
    # Fixed: was type=int with default=0.01 — any fractional decay value passed
    # on the CLI failed int() conversion, so the option was effectively unusable.
    cont_learn_params.add_argument("--gamma_ewc", help="If ewc: The percentage of decay.",
                                   type=float, default=0.01)
    ## for memory replay (er, mbpa, mbpa++) mbpa++ is automatically the case if sampling_type == random
    cont_learn_params.add_argument("--max_mem_sz", help="The maximum size of the memory to be used in replay",
                                   type=int, default=60000)
    cont_learn_params.add_argument("--storing_type", help="The method used to store memory examples.",
                                   choices=["reservoir", "ring", "k-means", "mof"],
                                   type=str, default="ring")
    cont_learn_params.add_argument("--sampling_type", help="The method used to sample memory examples.",
                                   choices=["random", "near_n"],
                                   type=str, default="random")
    cont_learn_params.add_argument("--sampling_k", help="The number of examples to be sampled.",
                                   type=int, default=60000)
    cont_learn_params.add_argument("--adaptive_epochs", help="The number of adaptive epochs for which the model is "
                                                            "to be trained on the retrieved batch in case of MbPA",
                                   type=int, default=5)  # between 1 and 20 in the original paper
    cont_learn_params.add_argument("--adaptive_adam_lr", help="The learning rate for Adaptive Adam Optimizer"
                                                             "in the case of MbPA/MbPA++.",
                                   type=float, default=1e-03)  # between 0.0 and 1.0 in the original paper
    cont_learn_params.add_argument("--beta", help="beta in the regularization of the adaptative loss",
                                   type=float, default=0.001)
    ### for optimization + memory (gem, agem)
    cont_learn_params.add_argument("--use_a_gem", help="If gem: whether to use averaged gem.",
                                   action="store_true")
    cont_learn_params.add_argument("--a_gem_n", help="If gem: The number of examples in the averaged memory.",
                                   type=int, default=100)
def add_checkpoint_arguments(parser):
    """Register checkpointing/logging flags on *parser* (all default to off)."""
    group = parser.add_argument_group("Checkpointing/logging Parameters")
    # NOTE(review): --save_change_params reuses --save_test_every_epoch's help
    # text verbatim — looks like a copy-paste; confirm its real semantics.
    flags = [
        ("--verbose", "If true, return golden labels and predictions to console."),
        ("--save_dev_pred", "If true, save the dev predictions."),
        ("--save_test_every_epoch", "If true, save test at the end of each epoch."),
        ("--save_change_params", "If true, save test at the end of each epoch."),
        ("--no_debug", "If true, save training and testing logs to disk."),
        ("--save_model", "Whether to save the model after training."),
    ]
    for flag, help_text in flags:
        group.add_argument(flag, help=help_text, action="store_true")
def add_meta_learning_setup(parser):
    """Register Reptile meta-learning hyperparameters on *parser*."""
    group = parser.add_argument_group("Meta-learning Parameters")
    group.add_argument("--alpha_reptile", type=float, default=0.001,
                       help="alpha lr in the inner loop in reptile")
    group.add_argument("--beta_reptile", type=float, default=0.01,
                       help="beta lr in the first outer loop reptile")
    group.add_argument("--gamma_reptile", type=float, default=0.01,
                       help="gamma lr in the second outer loop reptile")
    group.add_argument("--num_batches_reptile", type=int, default=10,
                       help="Number of batches per task")
    group.add_argument("--use_reptile", action="store_true",
                       help="Whether to use reptile or not")
    group.add_argument("--use_batches_reptile", action="store_true",
                       help="Whether to use many batches per task or not")
def add_spaced_repetition_setup(parser):
    """Register spaced-repetition (Leitner queue) scheduling options on *parser*."""
    group = parser.add_argument_group("Spaced Repetition Parameters")
    group.add_argument("--use_processor_sharing",
                       action="store_true",
                       help="Whether to use the processor sharing service "
                            "discipline or first-in-first-out")
    group.add_argument("--evaluate_one_batch",
                       action="store_true",
                       help="Whether to use one batch to update the leitner queues")
    group.add_argument("--eval_sched_freq",
                       type=int,
                       default=10,
                       help="How frequently should we evaluate and update the Leitner Scheduler")
    group.add_argument("--warm_start_epochs",
                       type=int,
                       default=2,
                       help="How many epochs should the rote training be")
    group.add_argument("--use_leitner_queue",
                       action="store_true",
                       help="Whether to just use Leitner queues or just a random baseline")
    group.add_argument("--demote_to_first_deck",
                       action="store_true",
                       help="Whether to demote all the way to the first deck or just to the previous one ")