
Commit 2731d63

multihead_attention: remove redundant constructor args for attention layers
1 parent 475cd06 commit 2731d63

File tree

5 files changed: +43 -77 lines changed
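
In short: the attention constructors now take only n_heads, and sequence_length / model_dimension are derived from the 2D input shape passed to init_base (or init). A minimal before/after sketch based on the updated test program in this commit; it assumes the nf_multihead_attention_layer module and its type are accessible via use association:

program mha_constructor_demo
  ! Sketch only: assumes the neural-fortran module touched by this commit
  ! is available at compile time and exports the type/constructor.
  use nf_multihead_attention_layer, only: multihead_attention_layer
  implicit none
  type(multihead_attention_layer) :: attention

  ! Before: attention = multihead_attention_layer(sequence_length=3, model_dimension=4, n_heads=2)
  !         call attention % init_base([0])
  ! After: only the head count is fixed at construction; the geometry comes from init_base.
  attention = multihead_attention_layer(n_heads=2)
  call attention % init_base([3, 4])   ! sequence_length=3, model_dimension=4
end program mha_constructor_demo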

src/nf/nf_cross_attention_layer.f90

Lines changed: 3 additions & 22 deletions
@@ -20,38 +20,19 @@ module nf_cross_attention_layer
   end type cross_attention_layer
 
   interface cross_attention_layer
-    module function cross_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
+    module function cross_attention_layer_cons(n_heads) result(res)
       !! This function returns the `cross_attention_layer` instance.
       integer, intent(in) :: sequence_length, model_dimension, n_heads
       type(cross_attention_layer) :: res
     end function cross_attention_layer_cons
   end interface cross_attention_layer
 
 contains
-  module function cross_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
+  module function cross_attention_layer_cons(n_heads) result(res)
     !! This function returns the `cross_attention_layer` instance.
-    integer, intent(in) :: sequence_length, model_dimension, n_heads
+    integer, intent(in) :: n_heads
     type(cross_attention_layer) :: res
-    res % sequence_length = sequence_length
-    res % model_dimension = model_dimension
     res % n_heads = n_heads
-
-    if (mod(model_dimension, n_heads) /= 0) then
-      write(stderr, '(a)'), 'Number of heads must be divisible by model dimension'
-      error stop
-    end if
-    res % head_size = model_dimension / n_heads
-
-    res % query_layer = linear2d_layer(model_dimension)
-    res % key_layer = linear2d_layer(model_dimension)
-    res % value_layer = linear2d_layer(model_dimension)
-    res % output_layer = linear2d_layer(model_dimension)
-    call res % query_layer % init([sequence_length, model_dimension])
-    call res % key_layer % init([sequence_length, model_dimension])
-    call res % value_layer % init([sequence_length, model_dimension])
-    call res % output_layer % init([sequence_length, model_dimension])
-
-    res % softmax_func = softmax()
   end function cross_attention_layer_cons
 
   module subroutine backward(self, input, gradient)
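
A short sketch of the new two-step setup for the cross-attention layer, mirroring the updated test_cross_attention subroutine later in this commit (use association of cross_attention_layer from nf_cross_attention_layer is assumed):

subroutine build_cross_attention(attention)
  ! Sketch only: mirrors the updated test_cross_attention setup.
  use nf_cross_attention_layer, only: cross_attention_layer   ! assumed public
  type(cross_attention_layer), intent(out) :: attention

  attention = cross_attention_layer(n_heads=1)   ! head count only; no sequence_length/model_dimension
  call attention % init([2, 3])                  ! [sequence_length, model_dimension] given at init time
end subroutine build_cross_attention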

src/nf/nf_multihead_attention.f90

Lines changed: 2 additions & 2 deletions
@@ -51,9 +51,9 @@ module nf_multihead_attention_layer
   end type multihead_attention_layer
 
   interface multihead_attention_layer
-    module function multihead_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
+    module function multihead_attention_layer_cons(n_heads) result(res)
       !! This function returns the `multihead_attention_layer` instance.
-      integer, intent(in) :: sequence_length, model_dimension, n_heads
+      integer, intent(in) :: n_heads
       type(multihead_attention_layer) :: res
     end function multihead_attention_layer_cons
   end interface multihead_attention_layer

src/nf/nf_multihead_attention_submodule.f90

Lines changed: 25 additions & 21 deletions
@@ -7,29 +7,11 @@
   implicit none
 
 contains
-  module function multihead_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
-    integer, intent(in) :: sequence_length, model_dimension, n_heads
+  module function multihead_attention_layer_cons(n_heads) result(res)
+    integer, intent(in) :: n_heads
     type(multihead_attention_layer) :: res
-    res % sequence_length = sequence_length
-    res % model_dimension = model_dimension
-    res % n_heads = n_heads
 
-    if (mod(model_dimension, n_heads) /= 0) then
-      write(stderr, '(a)'), 'Number of heads must be divisible by model dimension'
-      error stop
-    end if
-    res % head_size = model_dimension / n_heads
-
-    res % query_layer = linear2d_layer(model_dimension)
-    res % key_layer = linear2d_layer(model_dimension)
-    res % value_layer = linear2d_layer(model_dimension)
-    res % output_layer = linear2d_layer(model_dimension)
-    call res % query_layer % init([sequence_length, model_dimension])
-    call res % key_layer % init([sequence_length, model_dimension])
-    call res % value_layer % init([sequence_length, model_dimension])
-    call res % output_layer % init([sequence_length, model_dimension])
-
-    res % softmax_func = softmax()
+    res % n_heads = n_heads
   end function multihead_attention_layer_cons
 
   module subroutine common_backward(self, input, gradient)
@@ -325,6 +307,28 @@ module subroutine init_base(self, input_shape)
     class(multihead_attention_layer), intent(in out) :: self
     integer, intent(in) :: input_shape(:)
 
+    if (size(input_shape) /= 2) then
+      error stop "MultiHead Attention accepts 2D input"
+    end if
+    self % sequence_length = input_shape(1)
+    self % model_dimension = input_shape(2)
+
+    if (mod(self % model_dimension, self % n_heads) /= 0) then
+      write(stderr, '(a)'), 'Number of heads must be divisible by model dimension'
+      error stop
+    end if
+    self % head_size = self % model_dimension / self % n_heads
+    self % softmax_func = softmax()
+
+    self % query_layer = linear2d_layer(self % model_dimension)
+    self % key_layer = linear2d_layer(self % model_dimension)
+    self % value_layer = linear2d_layer(self % model_dimension)
+    self % output_layer = linear2d_layer(self % model_dimension)
+    call self % query_layer % init([self % sequence_length, self % model_dimension])
+    call self % key_layer % init([self % sequence_length, self % model_dimension])
+    call self % value_layer % init([self % sequence_length, self % model_dimension])
+    call self % output_layer % init([self % sequence_length, self % model_dimension])
+
     allocate(self % attention_matrix(self % sequence_length, self % sequence_length, self % n_heads))
     allocate(self % sdpa(self % sequence_length, self % head_size, self % n_heads))
     allocate(self % output(self % sequence_length, self % model_dimension))
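
The shape handling that previously lived in the constructors now happens once in init_base. A standalone sketch of the same derivation in plain Fortran, independent of the library, to make the arithmetic concrete; the [148, 512] shape and 8 heads match the "reallife shape" test below:

program head_size_demo
  ! Illustration of the checks init_base now performs; names mirror the diff
  ! but nothing here depends on neural-fortran itself.
  use iso_fortran_env, only: stderr => error_unit
  implicit none
  integer :: input_shape(2), sequence_length, model_dimension, n_heads, head_size

  n_heads = 8
  input_shape = [148, 512]               ! as passed to init_base in the test

  sequence_length = input_shape(1)       ! 148
  model_dimension = input_shape(2)       ! 512
  if (mod(model_dimension, n_heads) /= 0) then
    write(stderr, '(a)') 'Number of heads must be divisible by model dimension'
    error stop
  end if
  head_size = model_dimension / n_heads  ! 512 / 8 = 64
  print '(a, i0)', 'head_size = ', head_size
end program head_size_demo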

src/nf/nf_self_attention_layer.f90

Lines changed: 4 additions & 23 deletions
@@ -20,38 +20,19 @@ module nf_self_attention_layer
   end type self_attention_layer
 
   interface self_attention_layer
-    module function self_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
+    module function self_attention_layer_cons(n_heads) result(res)
       !! This function returns the `self_attention_layer` instance.
-      integer, intent(in) :: sequence_length, model_dimension, n_heads
+      integer, intent(in) :: n_heads
       type(self_attention_layer) :: res
     end function self_attention_layer_cons
   end interface self_attention_layer
 
 contains
-  module function self_attention_layer_cons(sequence_length, model_dimension, n_heads) result(res)
+  module function self_attention_layer_cons(n_heads) result(res)
     !! This function returns the `self_attention_layer` instance.
-    integer, intent(in) :: sequence_length, model_dimension, n_heads
+    integer, intent(in) :: n_heads
    type(self_attention_layer) :: res
-    res % sequence_length = sequence_length
-    res % model_dimension = model_dimension
     res % n_heads = n_heads
-
-    if (mod(model_dimension, n_heads) /= 0) then
-      write(stderr, '(a)'), 'Number of heads must be divisible by model dimension'
-      error stop
-    end if
-    res % head_size = model_dimension / n_heads
-
-    res % query_layer = linear2d_layer(model_dimension)
-    res % key_layer = linear2d_layer(model_dimension)
-    res % value_layer = linear2d_layer(model_dimension)
-    res % output_layer = linear2d_layer(model_dimension)
-    call res % query_layer % init([sequence_length, model_dimension])
-    call res % key_layer % init([sequence_length, model_dimension])
-    call res % value_layer % init([sequence_length, model_dimension])
-    call res % output_layer % init([sequence_length, model_dimension])
-
-    res % softmax_func = softmax()
   end function self_attention_layer_cons
 
   module subroutine backward(self, input, gradient)
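
The self-attention layer follows the same two-step pattern; a hedged sketch mirroring the updated test_self_attention (use association of self_attention_layer from nf_self_attention_layer is assumed, and the 0.1 weights are just the test's deterministic setup):

subroutine build_self_attention(attention)
  ! Sketch only: mirrors the updated test_self_attention setup.
  use nf_self_attention_layer, only: self_attention_layer   ! assumed public
  type(self_attention_layer), intent(out) :: attention

  attention = self_attention_layer(n_heads=1)   ! head count only
  call attention % init([2, 3])                 ! sequence_length=2, model_dimension=3
  attention % query_layer % weights = 0.1       ! deterministic weights, as in the test
  attention % key_layer % weights = 0.1
  attention % value_layer % weights = 0.1
end subroutine build_self_attention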

test/test_multihead_attention_layer.f90

Lines changed: 9 additions & 9 deletions
@@ -14,8 +14,8 @@ program test_multihead_attention_layer
   real :: minput(3, 4) = reshape([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.11, 0.12], [3, 4])
   real :: output(3, 2, 2)
 
-  attention = multihead_attention_layer(sequence_length=3, model_dimension=4, n_heads=2)
-  call attention % init_base([0])
+  attention = multihead_attention_layer(n_heads=2)
+  call attention % init_base([3, 4])
   call set_weights(attention)
 
   call test_multihead_attention_split_heads(attention, sample_input, ok, split_heads_output)
@@ -210,8 +210,8 @@ subroutine test_multihead_attention_forward_reallife_shape(ok)
 
     call random_number(input)
 
-    attention = multihead_attention_layer(sequence_length=148, model_dimension=512, n_heads=8)
-    call attention % init_base([0])
+    attention = multihead_attention_layer(n_heads=8)
+    call attention % init_base([148, 512])
     call set_weights(attention)
 
     call attention % common_forward(input, input, input)
@@ -317,8 +317,8 @@ subroutine test_self_attention(ok)
        0.350671142, 0.607403040, 0.350671142, 0.607403040, 0.350671142, 0.607403040&
     ]
 
-    attention = self_attention_layer(sequence_length=2, model_dimension=3, n_heads=1)
-    call attention % init([0])
+    attention = self_attention_layer(n_heads=1)
+    call attention % init([2, 3])
     attention % query_layer % weights = 0.1
     attention % key_layer % weights = 0.1
     attention % value_layer % weights = 0.1
@@ -366,8 +366,8 @@ subroutine test_cross_attention(ok)
     input(1, :, :) = query
     input(2, :, :) = key_value
 
-    attention = cross_attention_layer(sequence_length=2, model_dimension=3, n_heads=1)
-    call attention % init([0])
+    attention = cross_attention_layer(n_heads=1)
+    call attention % init([2, 3])
     attention % query_layer % weights = 0.1
     attention % key_layer % weights = 0.1
     attention % value_layer % weights = 0.1
@@ -396,4 +396,4 @@ subroutine test_cross_attention(ok)
       write(stderr, '(a)') 'backward returned incorrect key-value values.. failed'
     end if
   end subroutine test_cross_attention
-end program test_multihead_attention_layer
+end program test_multihead_attention_layer
