
Commit cd99740

[Fix] Lib with small max_seq_len incompatible with prebuilt weight (#840)
This PR fixes an issue introduced by #780, which broke our intended behavior of keeping the cos/sin cache shape independent of the max sequence length, so that no matter what max sequence length people use, they can reuse the same set of prebuilt weights and do not need to clone different weight repositories. At the same time, the need for larger max sequence lengths is growing: prior to #780, whenever the max sequence length exceeded 2048, the cached cos/sin tables were too short and the model broke. To stay as compatible as possible, this PR changes the behavior to take the maximum of 2048 and the specified max sequence length when building the model lib. With this fix, when the max sequence length is at most 2048 we can still use the prebuilt weights; when it is larger, we can only use the weights converted alongside the build.
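The sizing rule described above can be sketched as follows. This is a minimal NumPy illustration of the cos/sin cache construction, not the actual MLC-LLM code; `build_rotary_cache` and the sample `max_sequence_length`/`head_dim` values are made up for demonstration.

```python
import numpy as np

def build_rotary_cache(max_sequence_length, head_dim, base=10000.0):
    """Precompute cos/sin tables for rotary embeddings.

    The cache covers max(max_sequence_length, 2048) positions, so libs
    built with a small max_seq_len still match the prebuilt weight shape.
    """
    cache_len = max(max_sequence_length, 2048)
    inv_freq = 1.0 / (base ** (np.arange(0, head_dim, 2).astype("float32") / head_dim))
    t = np.arange(cache_len, dtype=inv_freq.dtype)
    freqs = np.einsum("i,j->ij", t, inv_freq)
    emb = np.concatenate((freqs, freqs), axis=-1)
    return np.cos(emb), np.sin(emb)

# A small max_seq_len still yields the 2048-row prebuilt shape...
cos, sin = build_rotary_cache(max_sequence_length=512, head_dim=128)
print(cos.shape)  # (2048, 128)

# ...while a larger one grows the cache (and needs freshly converted weights).
cos, sin = build_rotary_cache(max_sequence_length=4096, head_dim=128)
print(cos.shape)  # (4096, 128)
```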
1 parent 8bd6918 commit cd99740

File tree

2 files changed: +7 additions, −5 deletions


mlc_llm/relax_model/llama.py

Lines changed: 5 additions & 5 deletions
@@ -577,13 +577,13 @@ def __init__(self, config: LlamaConfig, sep_embed: bool = False):
         assert config.hidden_size % config.num_attention_heads == 0
         head_dim = config.hidden_size // config.num_attention_heads

-        # Set the cached sin/cos to the max seq len.
+        # Set the cached sin/cos to the maximum of 2048 and max seq len.
         # This will be eliminated further with online rotary embedding calculation.
         self.cos_cached = nn.Parameter(
-            (config.max_sequence_length, head_dim), dtype=config.dtype, name="cos_cached"
+            (max(config.max_sequence_length, 2048), head_dim), dtype=config.dtype, name="cos_cached"
         )
         self.sin_cached = nn.Parameter(
-            (config.max_sequence_length, head_dim), dtype=config.dtype, name="sin_cached"
+            (max(config.max_sequence_length, 2048), head_dim), dtype=config.dtype, name="sin_cached"
         )
         ############ End ############

@@ -892,9 +892,9 @@ def f_compute_relax_param(relax_pname: str, torch_params: List[Any]):
         inv_freq = 1.0 / (
             config.position_embedding_base ** (np.arange(0, head_dim, 2).astype("float32") / head_dim)
         )
-        # Set the cached sin/cos to the max sequence length.
+        # Set the cached sin/cos to the maximum of 2048 and max sequence length.
         # This will be eliminated further with online rotary embedding calculation.
-        t = np.arange(config.max_sequence_length, dtype=inv_freq.dtype)
+        t = np.arange(max(config.max_sequence_length, 2048), dtype=inv_freq.dtype)
         freqs = np.einsum("i,j->ij", t, inv_freq)
         emb = np.concatenate((freqs, freqs), axis=-1)
         param_list[-2] = tvm.nd.array(np.cos(emb).astype(config.dtype), device)
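Why the cache shape matters for weight reuse: weight loaders typically require an exact shape match between a model lib's declared parameter and the prebuilt array. The sketch below illustrates this with a hypothetical `load_param` helper (not an MLC-LLM or TVM API); the shapes are illustrative.

```python
import numpy as np

# Shape baked into the prebuilt weights (2048 cached positions, head_dim 128).
prebuilt_cos = np.zeros((2048, 128), dtype="float16")

def load_param(expected_shape, array):
    """Hypothetical loader: reject arrays whose shape differs from the lib's declaration."""
    if array.shape != expected_shape:
        raise ValueError(f"shape mismatch: expected {expected_shape}, got {array.shape}")
    return array

# Before the fix, a lib built with max_seq_len=512 declared a (512, 128)
# cache and rejected the prebuilt table:
try:
    load_param((512, 128), prebuilt_cos)
except ValueError as err:
    print(err)

# With the max(..., 2048) clamp the declared shape is (2048, 128), so
# the prebuilt weights load cleanly.
load_param((max(512, 2048), 128), prebuilt_cos)
```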

mlc_llm/transform/fuse_split_rotary_embedding.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@

 def get_split_rotary(num_attention_heads, head_dim, max_sequence_length=2048):
     hidden_size = num_attention_heads * head_dim
+    max_sequence_length = max(max_sequence_length, 2048)

     @T.prim_func
     def split_rotary(

@@ -77,6 +78,7 @@ def split_rotary(

 def fuse_split_rotary_embedding(mod, num_attention_heads, hidden_size, max_sequence_length=2048):
     head_dim = hidden_size // num_attention_heads
+    max_sequence_length = max(max_sequence_length, 2048)

     mod["split_rotary"] = get_split_rotary(num_attention_heads, head_dim, max_sequence_length)
