solver.py
import time
import numpy as np
from imagernn.utils import randi

class Solver:
"""
solver worries about:
- different optimization methods, updates, weight decays
- it can also perform gradient check
"""
def __init__(self):
self.step_cache_ = {} # might need this
self.step_cache2_ = {} # might need this

  def step(self, batch, model, cost_function, **kwargs):
    """
    perform a single batch update. Takes as input:
    - batch of data (X)
    - model (W)
    - cost function which takes batch, model
    and optional kwargs: learning_rate, update (list of parameter names to update),
    grad_clip, solver ('vanilla', 'rmsprop', 'adagrad' or 'adadelta'),
    momentum, smooth_eps, decay_rate
    """
    learning_rate = kwargs.get('learning_rate', 0.0)
    update = kwargs.get('update', model.keys())
    grad_clip = kwargs.get('grad_clip', -1)
    solver = kwargs.get('solver', 'vanilla')
    momentum = kwargs.get('momentum', 0)
    smooth_eps = kwargs.get('smooth_eps', 1e-8)
    decay_rate = kwargs.get('decay_rate', 0.999)

    if not (solver == 'vanilla' and momentum == 0):
      # lazily make sure we initialize step cache if needed
      for u in update:
        if not u in self.step_cache_:
          self.step_cache_[u] = np.zeros(model[u].shape)
          if solver == 'adadelta':
            self.step_cache2_[u] = np.zeros(model[u].shape) # adadelta needs one more cache

    # compute cost and gradient
    cg = cost_function(batch, model)
    cost = cg['cost']
    grads = cg['grad']
    stats = cg['stats']

    # clip gradients if needed, simplest possible version
    # todo later: maybe implement the gradient direction conserving version
    if grad_clip > 0:
      for p in update:
        if p in grads:
          grads[p] = np.minimum(grads[p], grad_clip)
          grads[p] = np.maximum(grads[p], -grad_clip)

    # perform parameter update
    for p in update:
      if p in grads:
        if solver == 'vanilla': # vanilla sgd, optional with momentum
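          # classical momentum: keep a running "velocity" in step_cache_ and
          # step along it (v = momentum * v - learning_rate * grad, then W += v)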
          if momentum > 0:
            dx = momentum * self.step_cache_[p] - learning_rate * grads[p]
            self.step_cache_[p] = dx
          else:
            dx = - learning_rate * grads[p]
        elif solver == 'rmsprop':
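          # rmsprop: leaky moving average of squared gradients normalizes the
          # step per parameter: cache = decay*cache + (1-decay)*g^2,
          # dx = -learning_rate * g / sqrt(cache + eps)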
          self.step_cache_[p] = self.step_cache_[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
          dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache_[p] + smooth_eps)
        elif solver == 'adagrad':
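          # adagrad: accumulate all squared gradients (no decay), so the
          # effective step size for each parameter only ever shrinks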
          self.step_cache_[p] += grads[p] ** 2
          dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache_[p] + smooth_eps)
        elif solver == 'adadelta':
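          # adadelta: like rmsprop, but also keeps a moving average of squared
          # updates in step_cache2_ so the step is scaled without an explicit
          # learning rate (note that dx below does not use learning_rate)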
          self.step_cache_[p] = self.step_cache_[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
          dx = - np.sqrt( (self.step_cache2_[p] + smooth_eps) / (self.step_cache_[p] + smooth_eps) ) * grads[p]
          self.step_cache2_[p] = self.step_cache2_[p] * decay_rate + (1.0 - decay_rate) * (dx ** 2)
        else:
          raise Exception("solver %s not supported" % (solver, ))

        # perform the parameter update
        model[p] += dx

    # create output dict and return
    out = {}
    out['cost'] = cost
    out['stats'] = stats
    return out

  def gradCheck(self, batch, model, cost_function, **kwargs):
    """
    perform gradient check.
    since gradcheck can be tricky (especially with relus involved)
    this function prints to console for visual inspection
    """
    num_checks = kwargs.get('num_checks', 10)
    delta = kwargs.get('delta', 1e-5)
    rel_error_thr_warning = kwargs.get('rel_error_thr_warning', 1e-2)
    rel_error_thr_error = kwargs.get('rel_error_thr_error', 1)

    cg = cost_function(batch, model)

    print 'running gradient check...'
    for p in model.keys():
      print 'checking gradient on parameter %s of shape %s...' % (p, `model[p].shape`)
      mat = model[p]

      s0 = cg['grad'][p].shape
      s1 = mat.shape
      assert s0 == s1, 'Error dims dont match: %s and %s.' % (`s0`, `s1`)

      for i in xrange(num_checks):
        ri = randi(mat.size)

        # evaluate cost at [x + delta] and [x - delta]
        old_val = mat.flat[ri]
        mat.flat[ri] = old_val + delta
        cg0 = cost_function(batch, model)
        mat.flat[ri] = old_val - delta
        cg1 = cost_function(batch, model)
        mat.flat[ri] = old_val # reset old value for this parameter

        # fetch both numerical and analytic gradient
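        # the numerical gradient is a centered difference,
        # (f(x + delta) - f(x - delta)) / (2 * delta), accurate to O(delta^2)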
        grad_analytic = cg['grad'][p].flat[ri]
        grad_numerical = (cg0['cost']['total_cost'] - cg1['cost']['total_cost']) / ( 2 * delta )

        # compare them
        if grad_numerical == 0 and grad_analytic == 0:
          rel_error = 0 # both are zero, OK.
          status = 'OK'
        elif abs(grad_numerical) < 1e-7 and abs(grad_analytic) < 1e-7:
          rel_error = 0 # not enough precision to check this
          status = 'VAL SMALL WARNING'
        else:
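          # scale-invariant relative error; the thresholds below map it to
          # OK / WARNING / NOTOK for visual inspection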
          rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic)
          status = 'OK'
          if rel_error > rel_error_thr_warning: status = 'WARNING'
          if rel_error > rel_error_thr_error: status = '!!!!! NOTOK'

        # print stats
        print '%s checking param %s index %8d (val = %+8f), analytic = %+8f, numerical = %+8f, relative error = %+8f' \
              % (status, p, ri, old_val, grad_analytic, grad_numerical, rel_error)
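
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original neuraltalk code). It shows the
# interface Solver expects from a cost function: a dict with 'cost' (containing
# 'total_cost'), 'grad' and 'stats'. The toy quadratic model and the helper
# name toy_cost are hypothetical, for illustration only; real callers pass the
# imagernn model and cost function instead.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
  np.random.seed(0)
  toy_model = {'W': np.random.randn(3, 4)} # a fake parameter blob

  def toy_cost(batch, model):
    # cost = 0.5 * sum(W^2), so dcost/dW = W; the batch is ignored here
    W = model['W']
    return {'cost': {'total_cost': 0.5 * np.sum(W ** 2)},
            'grad': {'W': W.copy()},
            'stats': {}}

  solver = Solver()
  solver.gradCheck(None, toy_model, toy_cost) # compare analytic vs numerical gradients
  for k in xrange(5):
    out = solver.step(None, toy_model, toy_cost, learning_rate=0.1, solver='rmsprop')
    print 'step %d, total_cost %f' % (k, out['cost']['total_cost'])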