Commit f676780

Allow passing gradient to network % backward() to bypass loss function
Parent: 165a6c4
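In chain-rule terms, backward() previously always seeded the output layer with the loss derivative computed from the target output; this commit lets the caller supply that seed directly, for example when it comes from a downstream network. A minimal sketch of the two call modes (the names net, y, and upstream_grad are illustrative, not from this commit):

  call net % backward(y)                           ! output layer seeded with d(loss)/d(output)
  call net % backward(y, gradient=upstream_grad)   ! output layer seeded with a caller-supplied gradient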

File tree: 3 files changed, +78 / -96 lines

example/merge_networks.f90

Lines changed: 5 additions & 43 deletions

@@ -34,7 +34,7 @@ program merge_networks
   ! Network 3
   net3 = network([ &
     input(net1_output_size + net2_output_size), &
-    dense(7) &
+    dense(7) &
   ])

   do n = 1, num_iterations
@@ -59,54 +59,16 @@ program merge_networks

     call net3 % forward([y1, y2])

-    ! Compute the gradients on the 3rd network
+    ! First compute the gradients on net3, then pass the gradients from the first
+    ! hidden layer on net3 to net1 and net2, and compute their gradients.
     call net3 % backward(y)

-    ! net3 % update() will clear the gradients immediately after updating
-    ! the weights, so we need to pass the gradients to net1 and net2 first
-
-    ! For net1 and net2, we can't use the existing net % backward() because
-    ! it currently assumes that the output layer gradients are computed based
-    ! on the loss function and not the gradient from the next layer.
-    ! For now, we need to manually pass the gradient from the first hidden layer
-    ! of net3 to the output layers of net1 and net2.
     select type (next_layer => net3 % layers(2) % p)
-      ! Assume net3's first hidden layer is dense;
-      ! would need to be generalized to others.
       type is (dense_layer)
-
-        nn = size(net1 % layers)
-        call net1 % layers(nn) % backward( &
-          net1 % layers(nn - 1), next_layer % gradient(1:net1_output_size) &
-        )
-
-        nn = size(net2 % layers)
-        call net2 % layers(nn) % backward( &
-          net2 % layers(nn - 1), next_layer % gradient(net1_output_size+1:size(next_layer % gradient)) &
-        )
-
+        call net1 % backward(y, gradient=next_layer % gradient(1:net1_output_size))
+        call net2 % backward(y, gradient=next_layer % gradient(net1_output_size+1:size(next_layer % gradient)))
     end select

-    ! Compute the gradients on hidden layers of net1, if any
-    do nn = size(net1 % layers)-1, 2, -1
-      select type (next_layer => net1 % layers(nn + 1) % p)
-        type is (dense_layer)
-          call net1 % layers(nn) % backward( &
-            net1 % layers(nn - 1), next_layer % gradient &
-          )
-      end select
-    end do
-
-    ! Compute the gradients on hidden layers of net2, if any
-    do nn = size(net2 % layers)-1, 2, -1
-      select type (next_layer => net2 % layers(nn + 1) % p)
-        type is (dense_layer)
-          call net2 % layers(nn) % backward( &
-            net2 % layers(nn - 1), next_layer % gradient &
-          )
-      end select
-    end do
-
     ! Gradients are now computed on all networks and we can update the weights
     call net1 % update(optimizer=sgd(learning_rate=1.))
     call net2 % update(optimizer=sgd(learning_rate=1.))
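The slicing in the gradient= arguments mirrors the concatenation in the forward pass: net3 consumes [y1, y2], so the first net1_output_size entries of its first hidden layer's input gradient belong to net1 and the remainder to net2. With hypothetical sizes net1_output_size = 3 and net2_output_size = 4 (the real values are set earlier in the example), the routing would read:

  ! size(next_layer % gradient) == 3 + 4 == 7 in this hypothetical case
  call net1 % backward(y, gradient=next_layer % gradient(1:3))   ! gradient with respect to y1
  call net2 % backward(y, gradient=next_layer % gradient(4:7))   ! gradient with respect to y2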

src/nf/nf_network.f90

Lines changed: 7 additions & 1 deletion

@@ -195,7 +195,7 @@ end function predict_batch_3d

   interface

-    module subroutine backward(self, output, loss)
+    module subroutine backward(self, output, loss, gradient)
       !! Apply one backward pass through the network.
       !! This changes the state of layers on the network.
       !! Typically used only internally from the `train` method,
@@ -206,6 +206,12 @@ module subroutine backward(self, output, loss)
        !! Output data
      class(loss_type), intent(in), optional :: loss
        !! Loss instance to use. If not provided, the default is quadratic().
+      real, intent(in), optional :: gradient(:)
+        !! Gradient to use for the output layer.
+        !! If not provided, the gradient in the last layer is computed using
+        !! the loss function.
+        !! Passing the gradient is useful for merging/concatenating multiple
+        !! networks.
     end subroutine backward

     module integer function get_num_params(self)
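One caller-facing detail implied by the signature above: output stays a required argument even when gradient is supplied; only loss and gradient are optional. A gradient-driven call therefore still passes the target array, as the merge example does (names below are illustrative):

  call net % backward(y, gradient=g)   ! y is still required by the interface;
                                       ! g, not the loss derivative of y, seeds the output layer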

src/nf/nf_network_submodule.f90

Lines changed: 66 additions & 52 deletions

@@ -115,10 +115,11 @@ module function network_from_layers(layers) result(res)
   end function network_from_layers


-  module subroutine backward(self, output, loss)
+  module subroutine backward(self, output, loss, gradient)
     class(network), intent(in out) :: self
     real, intent(in) :: output(:)
     class(loss_type), intent(in), optional :: loss
+    real, intent(in), optional :: gradient(:)
     integer :: n, num_layers

     ! Passing the loss instance is optional. If not provided, and if the
@@ -140,58 +141,71 @@ module subroutine backward(self, output, loss)

     ! Iterate backward over layers, from the output layer
     ! to the first non-input layer
-    do n = num_layers, 2, -1
-
-      if (n == num_layers) then
-        ! Output layer; apply the loss function
-        select type(this_layer => self % layers(n) % p)
-          type is(dense_layer)
-            call self % layers(n) % backward( &
-              self % layers(n - 1), &
-              self % loss % derivative(output, this_layer % output) &
-            )
-          type is(flatten_layer)
-            call self % layers(n) % backward( &
-              self % layers(n - 1), &
-              self % loss % derivative(output, this_layer % output) &
-            )
-        end select
-      else
-        ! Hidden layer; take the gradient from the next layer
-        select type(next_layer => self % layers(n + 1) % p)
-          type is(dense_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(dropout_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(conv2d_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(flatten_layer)
-            if (size(self % layers(n) % layer_shape) == 2) then
-              call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient_2d)
-            else
-              call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient_3d)
-            end if
-          type is(maxpool2d_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(reshape3d_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(linear2d_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(self_attention_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(maxpool1d_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(reshape2d_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(conv1d_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(locally_connected2d_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-          type is(layernorm_layer)
-            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
-        end select
-      end if

+    ! Output layer first
+    n = num_layers
+    if (present(gradient)) then
+
+      ! If the gradient is passed, use it directly for the output layer
+      select type(this_layer => self % layers(n) % p)
+        type is(dense_layer)
+          call self % layers(n) % backward(self % layers(n - 1), gradient)
+        type is(flatten_layer)
+          call self % layers(n) % backward(self % layers(n - 1), gradient)
+      end select
+
+    else
+
+      ! Apply the loss function
+      select type(this_layer => self % layers(n) % p)
+        type is(dense_layer)
+          call self % layers(n) % backward( &
+            self % layers(n - 1), &
+            self % loss % derivative(output, this_layer % output) &
+          )
+        type is(flatten_layer)
+          call self % layers(n) % backward( &
+            self % layers(n - 1), &
+            self % loss % derivative(output, this_layer % output) &
+          )
+      end select
+
+    end if
+
+    ! Hidden layers; take the gradient from the next layer
+    do n = num_layers - 1, 2, -1
+      select type(next_layer => self % layers(n + 1) % p)
+        type is(dense_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(dropout_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(conv2d_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(flatten_layer)
+          if (size(self % layers(n) % layer_shape) == 2) then
+            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient_2d)
+          else
+            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient_3d)
+          end if
+        type is(maxpool2d_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(reshape3d_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(linear2d_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(self_attention_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(maxpool1d_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(reshape2d_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(conv1d_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(locally_connected2d_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+        type is(layernorm_layer)
+          call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+      end select
     end do

   end subroutine backward
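For callers, the new branch above means a hand-computed output-layer gradient can drive a whole backward pass, provided the network's output layer is a dense or flatten layer and the gradient matches the output size. A self-contained sketch, not part of this commit; the network shape, data, learning rate, and the predicted - expected gradient form are illustrative assumptions only:

  program gradient_backward_sketch
    use nf, only: dense, input, network, sgd
    implicit none
    type(network) :: net
    real :: x(3), y(2)
    real, allocatable :: predicted(:), g(:)

    net = network([input(3), dense(2)])
    x = [0.2, 0.4, 0.6]
    y = [1., 0.]

    predicted = net % predict(x)         ! forward pass; returns the network output
    g = predicted - y                    ! caller-computed d(loss)/d(output); illustrative form only
    call net % backward(y, gradient=g)   ! bypasses the loss function for the output layer
    call net % update(optimizer=sgd(learning_rate=1.))
  end program gradient_backward_sketch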
