-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathAL.py
494 lines (448 loc) · 19.5 KB
/
AL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
import numpy as np
import copy
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
######################################################
##### Utility Funcs #####
######################################################
def prepare_models(desc_array, y_array, num_models,
                   n_estimators=5, max_depth=1, random_state=42):
    ''' Builds several random forests that differ only in their seed.

    Parameters
    ----------
    desc_array, y_array : np.2d/1darrays
        Training inputs and labels, passed unchanged to every ``.fit()``.
    num_models : int
        Number of models to prepare.
    n_estimators, max_depth : int
        Hyperparameters forwarded to each RandomForestClassifier.
    random_state : int
        Seed of the first model; model ``k`` uses ``random_state + k``.

    Returns
    -------
    model_list : list of fitted RandomForestClassifiers.
    '''
    def _fit_with_seed(seed):
        # One forest per seed, all trained on the same data.
        forest = RandomForestClassifier(n_estimators=n_estimators,
                                        max_depth=max_depth,
                                        random_state=seed)
        forest.fit(desc_array, y_array)
        return forest

    return [_fit_with_seed(random_state + offset)
            for offset in range(num_models)]
def concat_source_and_batch(source_X, source_y,
                            target_batch_X, target_batch_y,
                            sample_weight_array,
                            weight_factor):
    ''' Concatenates input arrays and prepares the sample weights
    required for .fit(). All outputs will be used as input for
    the next round.

    Parameters
    ----------
    source_X, source_y : np.2d/1darrays
        Source arrays.
    target_batch_X, target_batch_y : np.2d/1darrays
        Arrays of target reactions that were collected this iteration.
    sample_weight_array : list of ints
        Weights of the rows already in source_X (source: 1,
        previously collected target: some integer). Not modified.
    weight_factor : int
        Sample weight for the newly collected target data.

    Returns
    -------
    new_X, new_y : np.2d/1darray
        source rows followed by the target batch rows.
    sample_weight_array : list
        A new list: the input weights followed by ``weight_factor``
        repeated once per row of the target batch.
    '''
    new_X = np.vstack((source_X, target_batch_X))
    new_y = np.concatenate((source_y, target_batch_y))
    # Build a fresh list rather than using `+=`: in-place extension would
    # silently grow the caller's list (and would turn into element-wise
    # addition if an ndarray were passed).
    new_weights = (list(sample_weight_array)
                   + [weight_factor] * len(target_batch_y))
    return new_X, new_y, new_weights
def divide_target_arrays(X_desc_target, X_id_target, y_target,
                         idx_rxn_to_run, remaining_idx):
    ''' Partitions the target arrays into the reactions that were just
    "run" and the ones still waiting, simulating experiment conduction.

    Parameters
    ----------
    X_desc_target, X_id_target : np.2darray
        Descriptor and ID arrays to split (row-indexed).
    y_target : np.1darray
        Label array to split.
    idx_rxn_to_run : list
        Row indices of reactions whose experiment has been conducted.
    remaining_idx : list
        Row indices of reactions that have not been experimented yet.

    Returns
    -------
    rxns_collected, rxns_remaining : tuple of np.ndarrays
        Each is (descriptor rows, id rows, labels) for its index set.
    '''
    def _rows(indices):
        # Same row selection applied to all three parallel arrays.
        return (X_desc_target[indices, :],
                X_id_target[indices, :],
                y_target[indices])

    rxns_collected = _rows(idx_rxn_to_run)
    rxns_remaining = _rows(remaining_idx)

    # Sanity check: the two index sets together must cover every row.
    n_rows_split = rxns_collected[0].shape[0] + rxns_remaining[0].shape[0]
    assert n_rows_split == X_desc_target.shape[0]
    return rxns_collected, rxns_remaining
######################################################
##### Choosing Rxns #####
######################################################
def most_confident_rxns(model, X_target_desc, num_rxns):
    ''' Chooses the reactions to conduct based on highest confidence,
    i.e. the lowest predicted probability in column 1 of
    ``model.predict_proba``.

    Parameters
    ----------
    model : classifier
        Current model; must provide ``predict_proba``.
    X_target_desc : np.2darray
        Descriptors of the remaining target reaction candidates.
    num_rxns : int
        Number of reactions to select.

    Returns
    -------
    idx_rxn_to_run : np.1darray
        Indices of rxns to run within X_target_desc
    remaining_idx : np.1darray
        Indices of rxns remaining within X_target_desc
    confidence : np.1darray
        1 - predicted probability, for the selected reactions
        (same order as idx_rxn_to_run).
    '''
    full_pred_proba = model.predict_proba(X_target_desc)
    if full_pred_proba.shape[1] > 1:
        pred_proba = full_pred_proba[:, 1]
    else:
        print("Only one label predicted", full_pred_proba)
        # Flatten to 1-D: keeping the (n, 1) array would make argsort work
        # along the length-1 axis and return 2-D all-zero "indices".
        # NOTE(review): with a single column this is the probability of the
        # only label the model knows, which may not be label 1 - confirm.
        pred_proba = full_pred_proba[:, 0]

    # Ascending order: lowest probability first.
    order = np.argsort(pred_proba)
    idx_rxn_to_run = order[:num_rxns]
    remaining_idx = order[num_rxns:]
    confidence = 1 - pred_proba[idx_rxn_to_run]
    return idx_rxn_to_run, remaining_idx, confidence
def highest_variance_btw_trees(model, X_target_desc, num_rxns):
    ''' Chooses reactions which have highest variance of predicted
    probability values across all trees in the RF model.

    Parameters
    ----------
    model : classifier
        Current model; must expose ``estimators_`` and ``predict_proba``.
    X_target_desc : np.2darray
        Target descriptor array.
    num_rxns : int
        Number of reactions to select.

    Returns
    -------
    idx_rxn_to_run : np.1darray
        Indices of rxns to run within X_target_desc
        (ascending variance, so the most uncertain one is last).
    remaining_idx : np.1darray
        Indices of rxns remaining within X_target_desc
    confidence : np.1darray
        1 - (column 1 of the forest's predict_proba) for the
        selected reactions.
    '''
    estimators = model.estimators_
    # One column per tree, holding that tree's column-0 probability.
    all_proba = np.zeros((X_target_desc.shape[0], len(estimators)))
    for col, estimator in enumerate(estimators):
        all_proba[:, col] = estimator.predict_proba(X_target_desc)[:, 0]
    var_proba = np.var(all_proba, axis=1)
    pred_proba = model.predict_proba(X_target_desc)[:, 1]

    order = np.argsort(var_proba)
    # Explicit split point instead of [-num_rxns:] / [:-num_rxns]:
    # with num_rxns == 0 those slices would select every reaction and
    # leave nothing remaining.
    split = max(len(order) - num_rxns, 0)
    idx_rxn_to_run = order[split:]
    remaining_idx = order[:split]
    return idx_rxn_to_run, remaining_idx, 1 - pred_proba[idx_rxn_to_run]
def highest_expectation_plus_std(model, X_target_desc, num_rxns,
                                 coeff):
    ''' Follows upper confidence bound. Selects reactions with
    highest predicted probability values + coeff * std across trees.

    Parameters
    ----------
    model : classifier
        Current model; must expose ``estimators_`` and ``predict_proba``.
    X_target_desc : np.2darray
        Target descriptor array.
    num_rxns : int
        Number of reactions to select.
    coeff : float
        For upper confidence bound, coefficient of standard deviation.
        (Coefficient of average is 1.)

    Returns
    -------
    idx_rxn_to_run : np.1darray
        Indices of rxns to run within X_target_desc
        (ascending score, so the best-scoring one is last).
    remaining_idx : np.1darray
        Indices of rxns remaining within X_target_desc
    pred_proba : np.1darray
        Column 0 of the forest's predict_proba for the selected
        reactions only (same order as idx_rxn_to_run).
    '''
    estimators = model.estimators_
    # One column per tree, holding that tree's column-0 probability.
    all_proba = np.zeros((X_target_desc.shape[0], len(estimators)))
    for col, estimator in enumerate(estimators):
        all_proba[:, col] = estimator.predict_proba(X_target_desc)[:, 0]
    std_proba = np.std(all_proba, axis=1)
    pred_proba = model.predict_proba(X_target_desc)[:, 0]

    comb = pred_proba + coeff * std_proba
    order = np.argsort(comb)
    # Explicit split point: [-num_rxns:] / [:-num_rxns] misbehave when
    # num_rxns == 0 (everything selected, nothing remaining).
    split = max(len(order) - num_rxns, 0)
    idx_rxn_to_run = order[split:]
    remaining_idx = order[:split]
    # Report probabilities of the selected reactions only, as documented
    # and as the other selection strategies do.
    return idx_rxn_to_run, remaining_idx, pred_proba[idx_rxn_to_run]
######################################################
##### Updating Model #####
######################################################
def replace_trees(model, new_X, new_y,
                  newly_conducted_X, newly_conducted_y,
                  sample_weight_array,
                  max_depth, random_state):
    '''Swaps out every tree of the forest that mispredicts at least one
    of the newly conducted experiments. The forest is edited in place.

    Parameters
    ----------
    model : random forest classifier
        Forest whose ``estimators_`` are inspected and overwritten.
    new_X, new_y : np.ndarrays
        Arrays of the combined source+collected target data; the
        replacement trees are trained on these.
    newly_conducted_X/y : np.ndarrays
        Arrays of the collected target data only; used to judge trees.
    sample_weight_array : list
        Sample weights for training the replacement trees.
    max_depth : int
        Maximum depth of each replacement tree.
    random_state : int
        Seed for the replacement estimator(s).

    Returns
    -------
    model : random forest classifier
        The same object that was passed in.
    '''
    # Positions of trees with at least one wrong prediction on the batch.
    wrong_positions = [
        pos for pos, estimator in enumerate(model.estimators_)
        if np.any(estimator.predict(newly_conducted_X) != newly_conducted_y)
    ]
    n_wrong = len(wrong_positions)
    print(f" Replacing {n_wrong} Trees.")

    if n_wrong == 0:
        return model

    if n_wrong == 1:
        # A single replacement is trained as a stand-alone decision tree.
        replacement = DecisionTreeClassifier(max_depth=max_depth,
                                             max_features="sqrt",
                                             random_state=random_state)
        replacement.fit(new_X, new_y, sample_weight=sample_weight_array)
        model.estimators_[wrong_positions[0]] = replacement
        return model

    # Several replacements: train a small forest of exactly that size and
    # transplant its trees, in order, into the flagged positions.
    donor = RandomForestClassifier(n_estimators=n_wrong,
                                   max_depth=max_depth,
                                   random_state=random_state)
    donor.fit(new_X, new_y, sample_weight=sample_weight_array)
    for pos, fresh_tree in zip(wrong_positions, donor.estimators_):
        model.estimators_[pos] = fresh_tree
    return model
def add_trees(model, new_X, new_y, sample_weight_array,
              num_trees, max_depth, random_state):
    '''Adds specified number of trees while not changing
    the original model.

    Parameters
    ----------
    model : classifier
        Current model. It is deep-copied; the input is left untouched.
    new_X, new_y : np.ndarrays
        Arrays of the reactions to train new trees on.
    sample_weight_array : list
        Sample weights for training the new trees.
    num_trees : int
        Number of trees to add to the forest. 0 returns an unchanged
        copy of ``model``.
    max_depth : int
        Maximum depth of each new tree.
    random_state : int
        Seed for the forest that produces the new trees.

    Returns
    -------
    model : random forest classifier
        Copy of ``model`` with the new trees appended to ``estimators_``
        and ``n_estimators`` increased accordingly.
    '''
    copied_model = copy.deepcopy(model)
    if num_trees == 0:
        # Nothing to add. Fitting a forest with zero estimators is rejected
        # by scikit-learn, so short-circuit instead of crashing.
        return copied_model

    rfc = RandomForestClassifier(
        random_state=random_state,
        max_depth=max_depth,
        n_estimators=num_trees,
    )
    rfc.fit(new_X, new_y, sample_weight=sample_weight_array)
    # NOTE(review): the appended trees keep whatever label set they saw in
    # new_y; presumably it must match the original forest's - confirm.
    copied_model.estimators_ += rfc.estimators_
    copied_model.n_estimators += rfc.n_estimators
    return copied_model
######################################################
##### Actual Iterative Active Learning #####
######################################################
def explore_target_in_batches(
        source_model, X_desc_source, y_source,
        X_desc_target, X_id_target, y_target,
        rxn_selection_strategy,
        model_update_strategy, add_every_iter=True,
        num_trees_to_add=0, new_max_depth=1,
        num_rxns_per_batch=3, weight_factor=3,
        enough_found=8, random_state=42, coeff=None,
        print_progress=True
):
    ''' Simulates exploration by conducting experiments selected
    by model in batches.

    The loop stops when ``enough_found`` desired reactions (label 0)
    have been collected, when the target pool is exhausted, or when an
    unknown strategy name is encountered.

    Parameters
    ----------
    source_model : classifier
        Model to be initially used.
    X_desc_source, y_source : np.2d/1darrays
        Descriptor (input) and yield label (output) arrays of source data.
    X_desc_target, X_id_target, y_target : np.2d/2d/1darrays
        Descriptor (input), ID (to easily track reactions) and
        yield label (output) arrays of target data.
    rxn_selection_strategy : {"confidence", "variance", "ucb"}
                             or a sequence with these elements
        Strategy to sample reactions to conduct. A sequence is indexed
        by iteration (element 0 for the first iteration).
    model_update_strategy : {"replace", "add_all", "add_collected", "new", "none"}
        • replace : replaces trees that predicts 'collected rxns' wrong.
        • add_all : adds new trees that are trained on combined data.
            --> should set 'num_trees_to_add' value to be greater than 0.
        • add_collected : adds new trees that are trained only on the
            collected target data (this batch if add_every_iter,
            otherwise every batch collected so far).
            --> should set 'num_trees_to_add' value to be greater than 0.
        • new : train new RF
        • none: use same model
        Ignored when weight_factor == 0 (the model is never updated).
    add_every_iter : bool
        whether new trees are added every iteration.
        • True : number of trees will be [5,8,11,14,•••] if num_trees_to_add=3
        • False : number of trees will be [5,8,8,8,8,•••] if num_trees_to_add=3
    num_trees_to_add : int
        if model_update_strategy is "add_all" or "add_collected" :
            number of trees to add each iteration.
        if model_update_strategy=="new" :
            new model will be trained with the initial model's
            number of trees + num_trees_to_add
    new_max_depth : int or list
        • int : fixes the max_depth value across iterations
        • list : changes max_depth value as iteration progresses.
    num_rxns_per_batch : int
        number of reactions to sample each batch.
    weight_factor : int
        weight each collected target data will have,
        compared to source data when being combined.
        0 disables both the data merge and the model update.
    enough_found : int
        after finding this number of reactions, stop exploring
    random_state : int
        random seed for modeling etc.
    coeff : float
        coefficient of standard deviation; must be given when the
        "ucb" selection strategy is used.
    print_progress : Bool
        whether to print information on sampled reactions each batch.

    Returns
    -------
    rxns_collected_per_batch : list of tuples
        (X_desc_array, X_id_array, y_array) per each iteration.
    confidence_selected_rxns : list of np.1darray
        confidence on selected reactions' desiredness for every iteration.
    model_by_iter : list of classifiers
        deep copy of the model as it stands at the end of each
        iteration (i.e. after that iteration's update).
    num_found_by_batch : list of ints
        cumulative number of desired reactions found after every
        iteration, starting with a leading 0.
    '''
    ### 0) Initiation
    rxns_collected_per_batch = []
    confidence_selected_rxns = []
    model_by_iter = []
    num_found_by_batch = [0]
    total_num_found = 0
    sample_weight_array = [1] * len(y_source)
    num_trees = len(source_model.estimators_)
    iteration = 1
    ref_source_model = copy.deepcopy(source_model)
    # Only the original row count is needed later, to slice off the
    # collected target rows that get appended to the source arrays.
    n_source = X_desc_source.shape[0]

    # Also stop once the pool is empty; selecting from an empty array
    # would otherwise raise inside the model.
    while total_num_found < enough_found and len(y_target) > 0:
        if print_progress:
            print("-----------" + "-" * len(str(iteration)))
            print(f"Iteration {iteration}")
            print("-----------" + "-" * len(str(iteration)))

        ### 1) Reaction Selection Phase
        if isinstance(rxn_selection_strategy, str):
            rxn_selection = rxn_selection_strategy
        else:
            # Per-iteration schedule (list/tuple of strategy names).
            rxn_selection = rxn_selection_strategy[iteration - 1]

        if rxn_selection == "confidence":
            idx_rxn_to_run, remaining_idx, conf = most_confident_rxns(
                source_model, X_desc_target, num_rxns_per_batch
            )
        elif rxn_selection == "variance":
            idx_rxn_to_run, remaining_idx, conf = highest_variance_btw_trees(
                source_model, X_desc_target, num_rxns_per_batch
            )
        elif rxn_selection == "ucb":
            idx_rxn_to_run, remaining_idx, conf = highest_expectation_plus_std(
                source_model, X_desc_target, num_rxns_per_batch, coeff)
        else:
            print("Inappropriate reaction selection strategy.")
            break

        ### 1-1) Update items to return
        rxns_collected, rxns_remaining = divide_target_arrays(
            X_desc_target, X_id_target, y_target,
            idx_rxn_to_run, remaining_idx
        )
        rxns_collected_per_batch.append(rxns_collected)
        confidence_selected_rxns.append(conf)
        # 0 because negative is minor for our dataset
        num_found = int(np.count_nonzero(rxns_collected[2] == 0))
        total_num_found += num_found
        num_found_by_batch.append(total_num_found)
        if print_progress:
            print(np.hstack((rxns_collected[1][:, 2:],
                             rxns_collected[2].reshape(-1, 1))))
            print(f"Found {num_found} rxns.")
            print()

        ### 1-2) Update arrays and sample_weight
        if weight_factor != 0:
            X_desc_source, y_source, sample_weight_array = concat_source_and_batch(
                X_desc_source, y_source,
                rxns_collected[0], rxns_collected[2],
                sample_weight_array, weight_factor)
        # The pool always shrinks, whether or not the data was merged.
        (X_desc_target, X_id_target, y_target) = rxns_remaining

        ### 2) Update Model
        if weight_factor != 0:
            if isinstance(new_max_depth, (list, tuple)):
                max_depth = new_max_depth[iteration - 1]
            else:
                # Plain value (int, numpy int, or None for unlimited depth).
                max_depth = new_max_depth
            seed = random_state + iteration

            if model_update_strategy == "replace":
                source_model = replace_trees(
                    source_model, X_desc_source, y_source,
                    rxns_collected[0], rxns_collected[2],
                    sample_weight_array,
                    max_depth, seed
                )
            elif model_update_strategy == "add_all":
                # Either keep growing the current forest or always extend
                # the untouched initial forest.
                base_model = source_model if add_every_iter else ref_source_model
                source_model = add_trees(
                    base_model, X_desc_source, y_source,
                    sample_weight_array, num_trees_to_add,
                    max_depth, seed
                )
            elif model_update_strategy == "add_collected":
                if add_every_iter:
                    source_model = add_trees(
                        source_model, rxns_collected[0], rxns_collected[2],
                        [1] * len(rxns_collected[2]), num_trees_to_add,
                        max_depth, seed
                    )
                else:
                    # Rows past n_source are all target reactions
                    # collected so far.
                    collected_X = X_desc_source[n_source:, :]
                    collected_y = y_source[n_source:]
                    source_model = add_trees(
                        ref_source_model,
                        collected_X, collected_y,
                        [1] * len(collected_y), num_trees_to_add,
                        max_depth, seed
                    )
            elif model_update_strategy == "new":
                rfc = RandomForestClassifier(random_state=seed,
                                             max_depth=max_depth,
                                             n_estimators=num_trees + num_trees_to_add)
                rfc.fit(X_desc_source, y_source,
                        sample_weight=sample_weight_array)
                source_model = rfc
            elif model_update_strategy == "none":
                pass  # keep using the same model
            else:
                print("Inappropriate model update strategy.")
                break

        model_by_iter.append(copy.deepcopy(source_model))
        iteration += 1

    return rxns_collected_per_batch, confidence_selected_rxns, \
        model_by_iter, num_found_by_batch