From c09c8765b4e2315fc87d3f3716ca4574da8c7d6e Mon Sep 17 00:00:00 2001
From: Nat Jeffries
Date: Mon, 4 Nov 2024 21:15:54 -0800
Subject: [PATCH] Add special case to avoid quantizing conv in Moonshine

- Add a define to prevent quantizing the first conv layers in the Moonshine preprocessor
- Add options to enable rotary positional embeddings in the Transformer Encoder spec.
---
 CMakeLists.txt                               |  5 +++++
 python/ctranslate2/specs/transformer_spec.py | 20 ++++++++++++++++++++
 src/models/model.cc                          |  3 +++
 3 files changed, 28 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 962976c55..e48e1a2c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,11 @@ option(BUILD_TESTS "Compile the tests" OFF)
 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 option(WITH_TENSOR_PARALLEL "Compile with NCCL and MPI backend" OFF)
 option(WITH_FLASH_ATTN "Compile with Flash Attention 2" OFF)
+option(MOONSHINE "Compile with moonshine specializations" OFF)
+
+if (MOONSHINE)
+  add_definitions(-DMOONSHINE)
+endif()
 
 if(ENABLE_PROFILING)
   message(STATUS "Enable profiling support")
diff --git a/python/ctranslate2/specs/transformer_spec.py b/python/ctranslate2/specs/transformer_spec.py
index 230e62cfd..60691d27c 100644
--- a/python/ctranslate2/specs/transformer_spec.py
+++ b/python/ctranslate2/specs/transformer_spec.py
@@ -22,6 +22,11 @@ def __init__(
         relative_attention_bias: bool = False,
         ffn_glu: bool = False,
         rms_norm: bool = False,
+        rotary_dim: Optional[int] = None,
+        rotary_interleave: bool = True,
+        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
+        rotary_scaling_factor: float = 1,
+        rotary_base: float = 10000,
         multi_query_attention: bool = False,
     ):
         """Initializes a Transformer encoder specification.
@@ -66,6 +71,11 @@ def __init__(
                 relative_attention_bias=relative_attention_bias,
                 ffn_glu=ffn_glu,
                 rms_norm=rms_norm,
+                rotary_dim=rotary_dim,
+                rotary_interleave=rotary_interleave,
+                rotary_scaling_type=rotary_scaling_type,
+                rotary_scaling_factor=rotary_scaling_factor,
+                rotary_base=rotary_base,
                 num_heads_kv=1 if multi_query_attention else None,
             )
             for _ in range(num_layers)
@@ -251,6 +261,11 @@ def __init__(
         relative_attention_bias=False,
         ffn_glu=False,
         rms_norm=False,
+        rotary_dim=None,
+        rotary_interleave=True,
+        rotary_scaling_type=None,
+        rotary_scaling_factor=1,
+        rotary_base=10000,
         num_heads_kv=None,
         sliding_window=None,
     ):
@@ -259,6 +274,11 @@ def __init__(
             relative_position=relative_position,
             relative_attention_bias=relative_attention_bias,
             rms_norm=rms_norm,
+            rotary_dim=rotary_dim,
+            rotary_interleave=rotary_interleave,
+            rotary_scaling_type=rotary_scaling_type,
+            rotary_scaling_factor=rotary_scaling_factor,
+            rotary_base=rotary_base,
             num_heads_kv=num_heads_kv,
             sliding_window=sliding_window,
         )
diff --git a/src/models/model.cc b/src/models/model.cc
index b8e1c2d8f..c2bb8e3f6 100644
--- a/src/models/model.cc
+++ b/src/models/model.cc
@@ -213,6 +213,9 @@ namespace ctranslate2 {
       if (device == Device::CUDA
 #ifdef CT2_WITH_DNNL
           || true
+#endif
+#ifdef MOONSHINE
+          || true
 #endif
           ) {
         variable_weight_dtype = float_dtype;
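
For reference, a minimal sketch of how a converter could exercise the new encoder-side
rotary options once this patch is applied. The num_layers/num_heads values and the
rotary_dim choice below are illustrative assumptions, not Moonshine's actual configuration:

    from ctranslate2.specs import transformer_spec

    # Hypothetical encoder spec using the rotary options added above.
    # The layer/head counts and rotary_dim are example values only.
    encoder = transformer_spec.TransformerEncoderSpec(
        num_layers=6,
        num_heads=8,
        rotary_dim=32,            # example: rotate the first 32 dimensions of each head
        rotary_interleave=False,  # example: non-interleaved rotary layout
        rotary_base=10000,
    )

On the build side, the specialization would be enabled with something like
`cmake -DMOONSHINE=ON ...`, which defines MOONSHINE and, per the commit message,
keeps the Moonshine preprocessor's first conv layers from being quantized.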