@@ -142,9 +142,7 @@ def __init__(self, neuron_count, neurons_in_previous_layer, activation_function,
        self.verbose = verbose
        self.name = name

-        # Cache all activations and inputs because we need it to do backpropagation
-        self.inputs_history = []
-        self.activations_history = []
+        self.init_new_sequence()

    def forward_pass(self, X):
        """
@@ -175,19 +173,21 @@ def forward_pass(self, X):

        return activations

-    def backward_pass(self, R):
+    def backward_pass(self, R, sequence_step, last_sequence):
        """
        Performs backward pass over one layer.

        Args
            R: np.ndarray of shape (batch_size, neurons_in_this_layer)
+            sequence_step: int, last_sequence: bool

        Returns
            np.ndarray of shape (batch_size, neurons_in_previous_layer), where neurons_in_previous_layer is
            the neuron count of the layer to the left (i.e., the input to this layer).
        """
        if self.activation_function == ActivationFunction.SOFTMAX:
-            activations = self.activations.T  # (batch_size, neurons)
+            # (batch_size, neurons_in_this_layer)
+            activations = self.activations_history[sequence_step].T

            batch_size = activations.shape[0]
            for b in range(batch_size):
@@ -207,22 +207,46 @@ def backward_pass(self, R):

            return R

-        activation_gradient = derivative_activation_function(
-            self.activation_function, self.activations).T
-        R *= activation_gradient
+        # (batch_size, neurons_in_this_layer)
+        activation_gradients = derivative_activation_function(
+            self.activation_function, self.activations_history[sequence_step]).T
+        R *= activation_gradients

        # Gradients for weights and bias
        batch_size = R.shape[0]
        # Divide by batch_size to get the average gradients over the batch
        # The average works because matrix multiplication sums the gradients
-        gradient_weights = np.matmul(self.inputs, R) / batch_size
+
+        # (neurons_in_previous_layer, batch_size) @ (batch_size, neurons_in_this_layer)
+        gradient_weights = np.matmul(
+            self.inputs_history[sequence_step], R) / batch_size
        gradient_bias = R.sum(axis=0, keepdims=True).T / batch_size

-        self.weights -= self.learning_rate * gradient_weights
-        self.bias -= self.learning_rate * gradient_bias
+        if self.gradient_weights is None:
+            self.gradient_weights = gradient_weights
+        else:
+            self.gradient_weights += gradient_weights  # Accumulate the gradients as we go
+
+        if self.gradient_bias is None:
+            self.gradient_bias = gradient_bias
+        else:
+            self.gradient_bias += gradient_bias  # Accumulate the gradients as we go
+
+        if last_sequence:
+            # Update the parameters on the last step of the sequence
+            self.weights -= self.learning_rate * self.gradient_weights
+            self.bias -= self.learning_rate * self.gradient_bias

+        # (batch_size, neurons_in_this_layer) @ (neurons_in_this_layer, neurons_in_previous_layer)
        return np.matmul(R, self.weights.T)

+    def init_new_sequence(self):
+        # Cache all activations and inputs because we need them for backpropagation
+        self.inputs_history = []
+        self.activations_history = []
+        self.gradient_weights = None
+        self.gradient_bias = None
+
    def __str__(self):
        return "{} neurons with {} as activation function".format(self.neuron_count, self.activation_function)

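Not part of this commit, but for context: the new `sequence_step`/`last_sequence` arguments and `init_new_sequence()` imply a calling pattern roughly like the sketch below. This is only an illustration; `layers`, `inputs`, `targets`, and `loss_gradient` are assumed names, and the exact shapes and loss handling depend on the rest of the network code.

```python
def train_on_sequence(layers, inputs, targets, loss_gradient):
    # Reset cached per-step activations/inputs and accumulated gradients
    for layer in layers:
        layer.init_new_sequence()

    # Forward pass: one call per sequence step; each layer caches its per-step state
    outputs = []
    for x in inputs:
        activations = x
        for layer in layers:
            activations = layer.forward_pass(activations)
        outputs.append(activations)

    # Backward pass: walk the sequence in reverse; each layer accumulates its
    # gradients internally and only applies the update when last_sequence is True
    for sequence_step in reversed(range(len(inputs))):
        R = loss_gradient(outputs[sequence_step], targets[sequence_step])
        for layer in reversed(layers):
            R = layer.backward_pass(R, sequence_step, last_sequence=(sequence_step == 0))
```

The key point is that `last_sequence` should be True exactly once per sequence, on the final backward call, so the accumulated gradients are applied a single time.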
@@ -244,6 +268,8 @@ def __init__(self, neuron_count, neurons_in_previous_layer, activation_function,
        self.recurrent_weights = init_weights_with_range(
            initial_weight_ranges[0], initial_weight_ranges[1], neuron_count, neuron_count)

+        self.output_jacobian = None
+
    def forward_pass(self, X):
        """
        Args:
@@ -275,18 +301,95 @@ def forward_pass(self, X):

        return activations

-    def backward_pass(self, R):
+    def backward_pass(self, R, sequence_step, last_sequence):
        """
        Performs backward pass over one layer.

        Args
            R: np.ndarray of shape (batch_size, neurons_in_this_layer)
-
+            sequence_step: int, the index of the current step in the sequence
+            last_sequence: bool, whether this is the last backward step of the sequence

        Returns
            np.ndarray of shape (batch_size, neurons_in_previous_layer), where neurons_in_previous_layer is
            the neuron count of the layer to the left (i.e., the input to this layer).
        """
-        pass
+        batch_size = R.shape[0]
+        # The recurrent weight matrix is MxM because the layer is fully connected to itself, where M = neurons_in_this_layer
+        # See pages 18-20 in the slides
+
+        # Save the output Jacobian because we need it at the next step of the backward pass through the sequence
+        if self.output_jacobian is None:
+            # Treated as a normal dense layer
+            self.output_jacobian = R
+        else:
+            # (batch_size, neurons_in_this_layer)
+            activation_gradients = derivative_activation_function(
+                self.activation_function, self.activations_history[sequence_step]).T
+            # ! Shouldn't we derive the activation function here?
+            # x = self.activations_history[sequence_step].T  # ?
+            x = activation_gradients  # ?
+
+            # (batch_size, neurons_in_this_layer, neurons_in_this_layer)
+            diag_matrix = np.empty((batch_size, x.shape[1], x.shape[1]))
+            for batch in range(batch_size):
+                diag_matrix[batch] = np.diag(x[batch])
+
+            # (batch_size, neurons_in_this_layer, neurons_in_this_layer) @ (neurons_in_this_layer, neurons_in_this_layer)
+            recurrent_jacobian = np.matmul(
+                diag_matrix, self.recurrent_weights.T)
+
+            assert recurrent_jacobian.shape == (
+                batch_size, self.neuron_count, self.neuron_count)
+
+            part_2 = np.empty_like(R)
+            for i in range(batch_size):
+                output_jacobian = self.output_jacobian[i]
+                output_jacobian = output_jacobian.reshape(
+                    (1, output_jacobian.shape[0]))
+
+                part_2[i] = np.matmul(output_jacobian, recurrent_jacobian[i])
+
+            # Page 17 in the slides
+            R += part_2
+            self.output_jacobian = R  # (batch_size, neurons_in_this_layer)
+
+        # (batch_size, neurons_in_this_layer)
+        activation_gradients = derivative_activation_function(
+            self.activation_function, self.activations_history[sequence_step]).T
+        R *= activation_gradients
+
+        # Gradients for weights and bias
+
+        # Divide by batch_size to get the average gradients over the batch
+        # The average works because matrix multiplication sums the gradients
+        gradient_weights = np.matmul(
+            self.inputs_history[sequence_step], R) / batch_size
+        gradient_bias = R.sum(axis=0, keepdims=True).T / batch_size
+
+        # TODO may be moved up/down, read up on this (before or after R update)
+        if self.gradient_weights is None:
+            self.gradient_weights = gradient_weights
+        else:
+            self.gradient_weights += gradient_weights  # Accumulate the gradients as we go
+
+        if self.gradient_bias is None:
+            self.gradient_bias = gradient_bias
+        else:
+            self.gradient_bias += gradient_bias  # Accumulate the gradients as we go
+
+        if last_sequence:
+            # Update the parameters on the last step of the sequence
+            self.weights -= self.learning_rate * self.gradient_weights
+            self.bias -= self.learning_rate * self.gradient_bias
+
+        # assert self.output_jacobian.shape == (batch_size, self.neurons_in_previous_layer), "Expected {}, got {}".format(
+        #     (batch_size, self.neurons_in_previous_layer), self.output_jacobian.shape)
+
+        return np.matmul(R, self.weights.T)
+
+    def init_new_sequence(self):
+        super().init_new_sequence()
+        self.output_jacobian = None


if __name__ == "__main__":
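A side note on the recurrent Jacobian block added above (pages 17-20 of the slides): building an explicit `np.diag(...)` matrix per batch element is equivalent to an element-wise product, since left-multiplying `self.recurrent_weights.T` by `diag(f'(a))` only scales its rows. The standalone check below is an illustration with made-up shapes and local names, not code from this commit.

```python
import numpy as np

rng = np.random.default_rng(0)
batch_size, neurons = 4, 3
delta = rng.standard_normal((batch_size, neurons))      # plays the role of self.output_jacobian
act_grad = rng.standard_normal((batch_size, neurons))   # derivative of the activation function
W_rec = rng.standard_normal((neurons, neurons))         # plays the role of self.recurrent_weights

# Loop form, mirroring the backward_pass code above
part_2_loop = np.empty_like(delta)
for i in range(batch_size):
    recurrent_jacobian = np.diag(act_grad[i]) @ W_rec.T   # (neurons, neurons)
    part_2_loop[i] = delta[i] @ recurrent_jacobian        # (neurons,)

# Vectorized equivalent: scaling by a diagonal matrix is an element-wise product
part_2_vec = (delta * act_grad) @ W_rec.T

assert np.allclose(part_2_loop, part_2_vec)
```

If the per-element loop ever becomes a bottleneck, the vectorized form computes the same `part_2` without constructing the (batch_size, M, M) tensor.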