From c09c8765b4e2315fc87d3f3716ca4574da8c7d6e Mon Sep 17 00:00:00 2001
From: Nat Jeffries
Date: Mon, 4 Nov 2024 21:15:54 -0800
Subject: [PATCH] Add special case to avoid quantizing conv in Moonshine

- Add a define to prevent quantizing the first conv layers in the Moonshine preprocessor
- Add options to enable rotary positional embeddings in the Transformer Encoder spec.
---
 CMakeLists.txt                               |  5 +++++
 python/ctranslate2/specs/transformer_spec.py | 20 ++++++++++++++++++++
 src/models/model.cc                          |  3 +++
 3 files changed, 28 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 962976c55..e48e1a2c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,11 @@ option(BUILD_TESTS "Compile the tests" OFF)
 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 option(WITH_TENSOR_PARALLEL "Compile with NCCL and MPI backend" OFF)
 option(WITH_FLASH_ATTN "Compile with Flash Attention 2" OFF)
+option(MOONSHINE "Compile with moonshine specializations" OFF)
+
+if (MOONSHINE)
+  add_definitions(-DMOONSHINE)
+endif()
 
 if(ENABLE_PROFILING)
   message(STATUS "Enable profiling support")
diff --git a/python/ctranslate2/specs/transformer_spec.py b/python/ctranslate2/specs/transformer_spec.py
index 230e62cfd..60691d27c 100644
--- a/python/ctranslate2/specs/transformer_spec.py
+++ b/python/ctranslate2/specs/transformer_spec.py
@@ -22,6 +22,11 @@ def __init__(
         relative_attention_bias: bool = False,
         ffn_glu: bool = False,
         rms_norm: bool = False,
+        rotary_dim: Optional[int] = None,
+        rotary_interleave: bool = True,
+        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
+        rotary_scaling_factor: float = 1,
+        rotary_base: float = 10000,
         multi_query_attention: bool = False,
     ):
         """Initializes a Transformer encoder specification.
@@ -66,6 +71,11 @@ def __init__(
                 relative_attention_bias=relative_attention_bias,
                 ffn_glu=ffn_glu,
                 rms_norm=rms_norm,
+                rotary_dim=rotary_dim,
+                rotary_interleave=rotary_interleave,
+                rotary_scaling_type=rotary_scaling_type,
+                rotary_scaling_factor=rotary_scaling_factor,
+                rotary_base=rotary_base,
                 num_heads_kv=1 if multi_query_attention else None,
             )
             for _ in range(num_layers)
@@ -251,6 +261,11 @@ def __init__(
         relative_attention_bias=False,
         ffn_glu=False,
         rms_norm=False,
+        rotary_dim=None,
+        rotary_interleave=True,
+        rotary_scaling_type=None,
+        rotary_scaling_factor=1,
+        rotary_base=10000,
         num_heads_kv=None,
         sliding_window=None,
     ):
@@ -259,6 +274,11 @@ def __init__(
             relative_position=relative_position,
             relative_attention_bias=relative_attention_bias,
             rms_norm=rms_norm,
+            rotary_dim=rotary_dim,
+            rotary_interleave=rotary_interleave,
+            rotary_scaling_type=rotary_scaling_type,
+            rotary_scaling_factor=rotary_scaling_factor,
+            rotary_base=rotary_base,
             num_heads_kv=num_heads_kv,
             sliding_window=sliding_window,
         )
diff --git a/src/models/model.cc b/src/models/model.cc
index b8e1c2d8f..c2bb8e3f6 100644
--- a/src/models/model.cc
+++ b/src/models/model.cc
@@ -213,6 +213,9 @@ namespace ctranslate2 {
       if (device == Device::CUDA
 #ifdef CT2_WITH_DNNL
           || true
+#endif
+#ifdef MOONSHINE
+          || true
 #endif
           ) {
         variable_weight_dtype = float_dtype;
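
For reference, a minimal sketch of how a converter could exercise the new encoder-side
rotary options once this patch is applied. The num_layers/num_heads values and the
rotary_dim choice below are illustrative assumptions, not Moonshine's actual configuration:

    from ctranslate2.specs import transformer_spec

    # Hypothetical encoder spec using the rotary options added above.
    # The layer/head counts and rotary_dim are example values only.
    encoder = transformer_spec.TransformerEncoderSpec(
        num_layers=6,
        num_heads=8,
        rotary_dim=32,            # example: rotate the first 32 dimensions of each head
        rotary_interleave=False,  # example: non-interleaved rotary layout
        rotary_base=10000,
    )

On the build side, the specialization would be enabled with something like
`cmake -DMOONSHINE=ON ...`, which defines MOONSHINE and, per the commit message,
keeps the Moonshine preprocessor's first conv layers from being quantized.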