
Commit 4e50ffb

misc fixes (#192)
* misc fixes
* add datasets dependency
vince62s authored Jan 21, 2025
1 parent d2992b7 commit 4e50ffb
Showing 4 changed files with 9 additions and 9 deletions.
5 changes: 2 additions & 3 deletions eole/bin/model/lora_weights.py
@@ -82,7 +82,6 @@ def run(cls, args):
             # use compute_dtype from lora finetuning
             new_config.training.compute_dtype = config.training.compute_dtype
         elif args.action == "concat":
-            model.half()  # We keep FP16 for all
             optim = lora_checkpoint["optim"]
             model_state_dict = model.state_dict()
             new_config = config
@@ -108,11 +107,11 @@ def run(cls, args):
             shard_dict = {}
             f.append(safe_open(shard, framework="pt", device="cpu"))
             for key in f[i].keys():
-                shard_dict[key] = model_state_dict[key]
+                shard_dict[key] = model_state_dict[key].to(running_config.storage_dtype)
             if i == 0:
                 for key in model_state_dict.keys():
                     if "estimator" in key:
-                        shard_dict[key] = model_state_dict[key]
+                        shard_dict[key] = model_state_dict[key].to(running_config.storage_dtype)
             logger.info("saving shard" + args.output + "/model.{:02d}.safetensors".format(i))
             save_file(
                 shard_dict,
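
The hunk above casts every tensor to the configured storage dtype as each safetensors shard is written, instead of forcing FP16 up front with model.half(). A minimal standalone sketch of that save pattern (the dtype value and tensor names are illustrative stand-ins, not the actual eole config):

    import torch
    from safetensors.torch import save_file

    # Stand-in for running_config.storage_dtype (assumption for this example).
    storage_dtype = torch.float16
    model_state_dict = {"decoder.layer_0.weight": torch.randn(4, 4, dtype=torch.float32)}

    # Cast each tensor to the storage dtype only at save time.
    shard_dict = {key: t.to(storage_dtype) for key, t in model_state_dict.items()}
    save_file(shard_dict, "model.00.safetensors")
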
9 changes: 5 additions & 4 deletions eole/modules/multi_headed_attn.py
@@ -405,6 +405,7 @@ def forward(
             not self.flash
             or self.position_encoding_type in [PositionEncodingType.Relative, PositionEncodingType.Alibi]
             or query.dtype not in [torch.float16, torch.bfloat16]  # to match with flash
+            or query.device == torch.device("cpu")
         ):
             key, value, query = self._prepare_inputs_w_cache(
                 query,
@@ -419,8 +420,8 @@ def forward(
             cache_len = self._expand_cache(32, step, key)
             if position_embeddings is not None:
                 rotdim = self.rotary_dim // 2
-                cos = position_embeddings[0][:, :rotdim].to(query.dtype)
-                sin = position_embeddings[1][:, :rotdim].to(query.dtype)
+                cos = position_embeddings[0][:, :rotdim].to(query.dtype).contiguous()
+                sin = position_embeddings[1][:, :rotdim].to(query.dtype).contiguous()
             else:
                 cos, sin = None, None
             # restore initial tgt_pad_mask - migth be better to store it instead.
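
With this change the .contiguous() calls move from the flash-attention call site (next hunk) to where cos and sin are created, so the sliced rotary tables are densified once per step. A rough standalone illustration of the slicing involved, with made-up shapes:

    import torch

    rotary_dim = 64
    # Made-up precomputed rotary tables of shape (max_len, rotary_dim).
    position_embeddings = (torch.randn(128, rotary_dim), torch.randn(128, rotary_dim))

    rotdim = rotary_dim // 2
    # [:, :rotdim] is a strided view of the table; .contiguous() after the
    # dtype cast guarantees a densely laid out tensor for the fused kernel.
    cos = position_embeddings[0][:, :rotdim].to(torch.float16).contiguous()
    sin = position_embeddings[1][:, :rotdim].to(torch.float16).contiguous()
    print(cos.shape, cos.is_contiguous())  # torch.Size([128, 32]) True
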
@@ -435,8 +436,8 @@ def forward(
                 self.layer_cache[1]["values"].transpose(1, 2),
                 key.transpose(1, 2),
                 value.transpose(1, 2),
-                rotary_cos=cos.contiguous(),
-                rotary_sin=sin.contiguous(),
+                rotary_cos=cos,
+                rotary_sin=sin,
                 cache_seqlens=cache_len - 1,
                 cache_leftpad=cache_leftpad,
                 causal=step == 0,
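
The device check added in the first hunk keeps CPU tensors off the flash path, which only handles FP16/BF16 inputs on CUDA devices. A small sketch of that kind of dispatch condition (the function and variable names are invented for the example):

    import torch

    def can_use_flash(query: torch.Tensor, flash_available: bool) -> bool:
        # Flash attention kernels need half-precision tensors on a CUDA device;
        # anything else falls back to the standard attention path.
        return (
            flash_available
            and query.dtype in (torch.float16, torch.bfloat16)
            and query.device.type == "cuda"
        )

    q = torch.randn(2, 8, 16, 64)  # float32 on CPU -> standard path
    print(can_use_flash(q, flash_available=True))  # False
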
3 changes: 1 addition & 2 deletions requirements.opt.txt
@@ -4,5 +4,4 @@ scipy
 bitsandbytes>=0.41.2
 spacy
 gradio
-autoawq
-pyarrow
+pyarrow
1 change: 1 addition & 0 deletions setup.py
@@ -23,6 +23,7 @@
         "fastapi",
         "fasttext-wheel",
         "huggingface_hub",
+        "datasets",
         "numpy<2.0",
         "pandas",
         "protobuf==3.20.1",
