forked from svs192219/ner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadagrad_trainer.py
70 lines (63 loc) · 3.43 KB
/
adagrad_trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# adagrad_trainer.py
from utils import *
import numpy as np
# Wrapper for using AdaGrad as the optimizer. AdagradTrainer wraps a weight vector and applies the custom
# AdaGrad update using second moments of features to make custom step sizes. This version incorporates L1
# regularization: while this regularization should be applied to squash the feature vector on every gradient update,
# we instead evaluate the regularizer lazily only when the particular feature is touched (either by gradient update
# or by access). approximate lets you turn this off for faster access, but regularization is now applied
# somewhat inconsistently.
class AdagradTrainer(object):
    """AdaGrad optimizer wrapping a weight vector, with lazily applied L1 regularization.

    Each feature gets its own step size eta / (1 + sqrt(sum of its squared gradients)).
    Rather than shrinking every weight on every update, the L1 shrinkage owed for the
    iterations in which a feature was not touched is applied the next time that feature
    is touched, either by a gradient update or by a read through access().

    With approximate=True, access() skips the catch-up shrinkage: reads are faster, but
    regularization is then only applied when a feature receives a gradient.
    """

    def __init__(self, init_weights, lamb, eta, approximate=False):
        """Set up the optimizer state.

        :param init_weights: initial weight vector; must expose .shape and support integer
            indexing (presumably a 1-D NumPy array). It is stored by reference and updated
            in place.
        :param lamb: L1 regularization strength (lambda)
        :param eta: base step size
        :param approximate: if True, access() returns weights without applying pending
            regularization
        """
        self.weights = init_weights
        self.lamb = lamb
        self.eta = eta
        self.approximate = approximate
        # Number of gradient updates applied so far
        self.curr_iter = 0
        # For each feature, the iteration at which its regularization was last brought up to date.
        # (Plain list multiplication works on both Python 2 and 3, unlike xrange.)
        self.last_iter_touched = [0] * self.weights.shape[0]
        # Per-feature running sum of squared gradients (the diagonal of G_t)
        self.diag_Gt = np.zeros_like(self.weights, dtype=float)

    def apply_gradient_update(self, gradient, batch_size):
        """Apply one AdaGrad step from a sparse gradient.

        The gradient is normalized by the batch size to keep hyperparameters constant as the
        batch size is varied.

        :param gradient: sparse gradient exposing keys() (the touched feature indices) and
            get_count(i) (the gradient value for feature i)
        :param batch_size: number of examples that contributed to the gradient
        """
        batch_size_multiplier = 1.0 / batch_size
        self.curr_iter += 1
        for i in gradient.keys():
            xti = self.weights[i]
            # N.B. We negate the gradient here because the Adagrad formulas are all for minimizing
            # and we're trying to maximize, so think of it as minimizing the negative of the objective
            # which has the opposite gradient
            # Equation (25) in http://www.cs.berkeley.edu/~jduchi/projects/DuchiHaSi10.pdf
            # eta is the step size, lambda is the regularization
            gti = -gradient.get_count(i) * batch_size_multiplier
            # Step size in effect before this gradient; used for the iterations that were skipped
            old_eta_over_Htii = self.eta / (1 + np.sqrt(self.diag_Gt[i]))
            self.diag_Gt[i] += gti * gti
            Htii = 1 + np.sqrt(self.diag_Gt[i])
            eta_over_Htii = self.eta / Htii
            new_xti = xti - eta_over_Htii * gti
            # Apply the regularizer for every iteration since touched: one shrinkage at the new
            # step size for this iteration, plus one at the old step size per skipped iteration
            iters_since_touched = self.curr_iter - self.last_iter_touched[i]
            self.last_iter_touched[i] = self.curr_iter
            self.weights[i] = np.sign(new_xti) * max(
                0,
                np.abs(new_xti)
                - self.lamb * eta_over_Htii
                - (iters_since_touched - 1) * self.lamb * old_eta_over_Htii)

    def access(self, i):
        """Get the weight of feature i, first applying any pending L1 shrinkage.

        Unless approximate is set, a feature that has not been touched in the current
        iteration is shrunk toward zero once for every iteration since it was last touched,
        and the stored weight is updated accordingly.

        :param i: feature index
        :return: the (possibly just-updated) weight of feature i
        """
        if not self.approximate and self.last_iter_touched[i] != self.curr_iter:
            xti = self.weights[i]
            Htii = 1 + np.sqrt(self.diag_Gt[i])
            eta_over_Htii = self.eta / Htii
            iters_since_touched = self.curr_iter - self.last_iter_touched[i]
            self.last_iter_touched[i] = self.curr_iter
            # Per-iteration shrinkage is lamb * eta / H_ii, the same quantity used in
            # apply_gradient_update. eta_over_Htii already includes eta, so it must not be
            # multiplied by eta a second time.
            self.weights[i] = np.sign(xti) * max(
                0, np.abs(xti) - iters_since_touched * self.lamb * eta_over_Htii)
        return self.weights[i]

    def score(self, feats):
        """Score a feature vector.

        :param feats: sequence of feature indices
        :return: sum of the weights of those features (0.0 for an empty sequence)
        """
        return sum((self.access(feat) for feat in feats), 0.0)

    def get_final_weights(self):
        """Return the final weight vector values.

        Manually calls access on every feature to force each weight to have an updated value.

        :return: the wrapped weight vector (the same object that was passed to __init__)
        """
        for i in range(self.weights.shape[0]):
            self.access(i)
        return self.weights