
Commit 711dbe8

Did some work on backprop for the recurrent layer. Weight/bias update still missing, but now it can run without crashing
1 parent: d88166f

3 files changed: +204, -66 lines

project_2/config.yaml (+1, -1)

@@ -9,7 +9,7 @@ testing_ratio: 0.1
 verbose: False

 learning_rate: 0.01
-batch_size: 3
+batch_size: 32
 epochs: 3

 # List of neurons in each layer on the format [input, hidden, ..., hidden, output] of
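
The only functional change in this file is the batch size going from 3 to 32. As a rough illustration of how these hyperparameters are typically consumed (a minimal sketch assuming config.yaml is loaded with PyYAML; the loading code itself is not part of this commit, and the variable names below are hypothetical):

    # Hypothetical config loader; names and structure are assumptions, not project code.
    import yaml

    with open("project_2/config.yaml") as f:
        config = yaml.safe_load(f)

    learning_rate = config["learning_rate"]  # 0.01
    batch_size = config["batch_size"]        # 32 after this commit (was 3)
    epochs = config["epochs"]                # 3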

project_2/layers.py (+117, -14)
@@ -142,9 +142,7 @@ def __init__(self, neuron_count, neurons_in_previous_layer, activation_function,
         self.verbose = verbose
         self.name = name

-        # Cache all activations and inputs because we need it to do backpropagation
-        self.inputs_history = []
-        self.activations_history = []
+        self.init_new_sequence()

     def forward_pass(self, X):
         """
@@ -175,19 +173,21 @@ def forward_pass(self, X):

         return activations

-    def backward_pass(self, R):
+    def backward_pass(self, R, sequence_step, last_sequence):
         """
         Performs backward pass over one layer.

         Args
             R: np.ndarray of shape (batch_size, neurons_in_this_layer)
+            TODO

         Returns
             np.ndarray of shape (batch_size, neurons_in_previous_layer), where neurons_in_previous_layer is
             the neuron count of the layer to the left (i.e., the input to this layer).
         """
         if self.activation_function == ActivationFunction.SOFTMAX:
-            activations = self.activations.T  # (batch_size, neurons)
+            # (batch_size, neurons_in_this_layer)
+            activations = self.activations_history[sequence_step].T

             batch_size = activations.shape[0]
             for b in range(batch_size):
@@ -207,22 +207,46 @@ def backward_pass(self, R):

             return R

-        activation_gradient = derivative_activation_function(
-            self.activation_function, self.activations).T
-        R *= activation_gradient
+        # (batch_size, neurons_in_this_layer)
+        activation_gradients = derivative_activation_function(
+            self.activation_function, self.activations_history[sequence_step]).T
+        R *= activation_gradients

         # Gradients for weights and bias
         batch_size = R.shape[0]
         # Divide by batch_size to get the average gradients over the batch
         # The average works because matrix multiplication sums the gradients
-        gradient_weights = np.matmul(self.inputs, R) / batch_size
+
+        # (neurons_in_previous_layer, batch_size) @ (batch_size, neurons_in_this_layer)
+        gradient_weights = np.matmul(
+            self.inputs_history[sequence_step], R) / batch_size
         gradient_bias = R.sum(axis=0, keepdims=True).T / batch_size

-        self.weights -= self.learning_rate * gradient_weights
-        self.bias -= self.learning_rate * gradient_bias
+        if self.gradient_weights is None:
+            self.gradient_weights = gradient_weights
+        else:
+            self.gradient_weights += gradient_weights  # Accumulate the gradients as we go
+
+        if self.gradient_bias is None:
+            self.gradient_bias = gradient_bias
+        else:
+            self.gradient_bias += gradient_bias  # Accumulate the gradients as we go
+
+        if last_sequence:
+            # Update parameters on the last sequence
+            self.weights -= self.learning_rate * self.gradient_weights
+            self.bias -= self.learning_rate * self.gradient_bias

+        # (batch_size, neurons_in_this_layer) @ (neurons_in_this_layer, neurons_in_previous_layer)
         return np.matmul(R, self.weights.T)

+    def init_new_sequence(self):
+        # Cache all activations and inputs because we need it to do backpropagation
+        self.inputs_history = []
+        self.activations_history = []
+        self.gradient_weights = None
+        self.gradient_bias = None
+
     def __str__(self):
         return "{} neurons with {} as activation function".format(self.neuron_count, self.activation_function)


@@ -244,6 +268,8 @@ def __init__(self, neuron_count, neurons_in_previous_layer, activation_function,
         self.recurrent_weights = init_weights_with_range(
             initial_weight_ranges[0], initial_weight_ranges[1], neuron_count, neuron_count)

+        self.output_jacobian = None
+
     def forward_pass(self, X):
         """
         Args:
@@ -275,18 +301,95 @@ def forward_pass(self, X):

         return activations

-    def backward_pass(self, R):
+    def backward_pass(self, R, sequence_step, last_sequence):
         """
         Performs backward pass over one layer.

         Args
             R: np.ndarray of shape (batch_size, neurons_in_this_layer)
-
+            sequence_step: int
+            last_sequence: bool
         Returns
             np.ndarray of shape (batch_size, neurons_in_previous_layer), where neurons_in_previous_layer is
             the neuron count of the layer to the left (i.e., the input to this layer).
         """
-        pass
+        batch_size = R.shape[0]
+        # MxM matrix which is fully connected to itself. Thus, M = neurons_in_this_layer
+        # Page 18-20 in the slides
+
+        # Save the output Jacobian as we need it in the recurrent sequence
+        if self.output_jacobian is None:
+            # Treated as a normal dense layer
+            self.output_jacobian = R
+        else:
+            # (batch_size, neurons_in_this_layer)
+            activation_gradients = derivative_activation_function(
+                self.activation_function, self.activations_history[sequence_step]).T
+            # ! Shouldn't we derive the activation function here?
+            # x = self.activations_history[sequence_step].T  # ?
+            x = activation_gradients  # ?
+
+            # (batch_size, neurons_in_this_layer, neurons_in_this_layer)
+            diag_matrix = np.empty((batch_size, x.shape[1], x.shape[1]))
+            for batch in range(batch_size):
+                diag_matrix[batch] = np.diag(x[batch])
+
+            # (batch_size, neurons_in_this_layer, neurons_in_this_layer) @ (neurons_in_this_layer, neurons_in_this_layer)
+            recurrent_jacobian = np.matmul(
+                diag_matrix, self.recurrent_weights.T)
+
+            assert recurrent_jacobian.shape == (
+                batch_size, self.neuron_count, self.neuron_count)
+
+            part_2 = np.empty_like(R)
+            for i in range(batch_size):
+                output_jacobian = self.output_jacobian[i]
+                output_jacobian = output_jacobian.reshape(
+                    (1, output_jacobian.shape[0]))
+
+                part_2[i] = np.matmul(output_jacobian, recurrent_jacobian[i])

+            # Page 17 in the slides
+            R += part_2
+            self.output_jacobian = R  # (batch_size, neurons_in_this_layer)
+
+        # (batch_size, neurons_in_this_layer)
+        activation_gradients = derivative_activation_function(
+            self.activation_function, self.activations_history[sequence_step]).T
+        R *= activation_gradients
+
+        # Gradients for weights and bias
+
+        # Divide by batch_size to get the average gradients over the batch
+        # The average works because matrix multiplication sums the gradients
+        gradient_weights = np.matmul(
+            self.inputs_history[sequence_step], R) / batch_size
+        gradient_bias = R.sum(axis=0, keepdims=True).T / batch_size
+
+        # TODO may be moved up/down, read up on this (before or after R update)
+        if self.gradient_weights is None:
+            self.gradient_weights = gradient_weights
+        else:
+            self.gradient_weights += gradient_weights  # Accumulate the gradients as we go
+
+        if self.gradient_bias is None:
+            self.gradient_bias = gradient_bias
+        else:
+            self.gradient_bias += gradient_bias  # Accumulate the gradients as we go
+
+        if last_sequence:
+            # Update parameters on the last sequence
+            self.weights -= self.learning_rate * self.gradient_weights
+            self.bias -= self.learning_rate * self.gradient_bias
+
+        # assert self.output_jacobian.shape == (batch_size, self.neurons_in_previous_layer), "Expected {}, got {}".format(
+        #     (batch_size, self.neurons_in_previous_layer), self.output_jacobian.shape)
+
+        return np.matmul(R, self.weights.T)
+
+    def init_new_sequence(self):
+        super().init_new_sequence()
+        self.output_jacobian = None


 if __name__ == "__main__":
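
The recurrent branch above builds, per sample, diag(activation_gradients) @ recurrent_weights.T and multiplies it with the cached self.output_jacobian from the previous backward call before adding the result to the incoming R (the "Page 17" step referenced in the comments). For comparison, the textbook backpropagation-through-time recursion for a vanilla recurrent layer can be sketched as follows; this is generic notation and an illustration only, not the project's code, and the course slides referenced in the comments are not reproduced here:

    import numpy as np

    def bptt_output_deltas(dL_dy, pre_activations, W_rec, d_act):
        """Illustrative delta recursion for a vanilla RNN layer.

        dL_dy:           list of (batch, M) loss gradients w.r.t. the layer output, one per step
        pre_activations: list of (batch, M) pre-activation sums, one per step
        W_rec:           (M, M) recurrent weight matrix
        d_act:           elementwise derivative of the activation function
        """
        T = len(dL_dy)
        deltas = [None] * T
        deltas[T - 1] = dL_dy[T - 1]  # last step: only the direct output gradient
        for t in reversed(range(T - 1)):
            # Gradient carried back from step t+1 through the recurrent weights,
            # plus the direct gradient on this step's output.
            carried = (deltas[t + 1] * d_act(pre_activations[t + 1])) @ W_rec.T
            deltas[t] = dL_dy[t] + carried
        return deltas

In the layer above, self.output_jacobian plays the role of deltas[t + 1], and recurrent_jacobian = diag(activation_gradients) @ self.recurrent_weights.T is the per-sample Jacobian it is multiplied with. Note that self.recurrent_weights is not updated anywhere in this diff, in line with the commit message.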
