Add special case to avoid quantizing conv in Moonshine
- Add a define to prevent quantizing the first conv layers in the
  Moonshine preprocessor
- Add options to enable rotary positional embeddings in the Transformer
  Encoder spec.
njeffrie committed Nov 5, 2024
1 parent 6373848 commit c09c876
Showing 3 changed files with 28 additions and 0 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -22,6 +22,11 @@ option(BUILD_TESTS "Compile the tests" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
option(WITH_TENSOR_PARALLEL "Compile with NCCL and MPI backend" OFF)
option(WITH_FLASH_ATTN "Compile with Flash Attention 2" OFF)
option(MOONSHINE "Compile with moonshine specializations" OFF)

if (MOONSHINE)
  add_definitions(-DMOONSHINE)
endif()

if(ENABLE_PROFILING)
  message(STATUS "Enable profiling support")
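The new MOONSHINE option defaults to OFF; when it is enabled at configure time (for example with the usual -DMOONSHINE=ON cache flag, which is standard CMake usage rather than part of this commit), the MOONSHINE preprocessor definition is added and picked up by the model.cc change below.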
20 changes: 20 additions & 0 deletions python/ctranslate2/specs/transformer_spec.py
@@ -22,6 +22,11 @@ def __init__(
        relative_attention_bias: bool = False,
        ffn_glu: bool = False,
        rms_norm: bool = False,
        rotary_dim: Optional[int] = None,
        rotary_interleave: bool = True,
        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
        rotary_scaling_factor: float = 1,
        rotary_base: float = 10000,
        multi_query_attention: bool = False,
    ):
        """Initializes a Transformer encoder specification.
@@ -66,6 +71,11 @@ def __init__(
                relative_attention_bias=relative_attention_bias,
                ffn_glu=ffn_glu,
                rms_norm=rms_norm,
                rotary_dim=rotary_dim,
                rotary_interleave=rotary_interleave,
                rotary_scaling_type=rotary_scaling_type,
                rotary_scaling_factor=rotary_scaling_factor,
                rotary_base=rotary_base,
                num_heads_kv=1 if multi_query_attention else None,
            )
            for _ in range(num_layers)
@@ -251,6 +261,11 @@ def __init__(
        relative_attention_bias=False,
        ffn_glu=False,
        rms_norm=False,
        rotary_dim=None,
        rotary_interleave=True,
        rotary_scaling_type=None,
        rotary_scaling_factor=1,
        rotary_base=10000,
        num_heads_kv=None,
        sliding_window=None,
    ):
@@ -259,6 +274,11 @@
            relative_position=relative_position,
            relative_attention_bias=relative_attention_bias,
            rms_norm=rms_norm,
            rotary_dim=rotary_dim,
            rotary_interleave=rotary_interleave,
            rotary_scaling_type=rotary_scaling_type,
            rotary_scaling_factor=rotary_scaling_factor,
            rotary_base=rotary_base,
            num_heads_kv=num_heads_kv,
            sliding_window=sliding_window,
        )
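To illustrate the new encoder options, here is a minimal sketch of a spec that enables rotary positional embeddings; the layer counts and rotary settings below are illustrative assumptions, not values taken from the commit.

from ctranslate2.specs import transformer_spec

# Hypothetical example: an encoder spec with rotary embeddings enabled.
# The argument values are illustrative only.
encoder_spec = transformer_spec.TransformerEncoderSpec(
    num_layers=6,
    num_heads=8,
    rotary_dim=64,            # head dimensions that receive rotary embeddings
    rotary_interleave=False,  # pair dimensions (i, i + dim/2) instead of interleaving
    rotary_base=10000,        # base of the inverse-frequency schedule
)

The new arguments are simply forwarded from the encoder spec down to each layer's attention spec, as the diff above shows.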
3 changes: 3 additions & 0 deletions src/models/model.cc
@@ -213,6 +213,9 @@ namespace ctranslate2 {
      if (device == Device::CUDA
#ifdef CT2_WITH_DNNL
          || true
#endif
#ifdef MOONSHINE
          || true
#endif
          ) {
        variable_weight_dtype = float_dtype;
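Because the added condition is unconditionally true when MOONSHINE is defined, such builds always take this branch and keep variable_weight_dtype at float_dtype, which is what prevents the Moonshine preprocessor's first conv layers from being quantized, as described in the commit message.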
