Xpu #558

Closed · wants to merge 13 commits into from

Changes from all commits
1 change: 1 addition & 0 deletions infer/aquila.sh
@@ -0,0 +1 @@
+python projects/Aquila/pipeline.py --model_path=/root/models/Aquila-7B --mode=huggingface --device=xpu
1 change: 1 addition & 0 deletions infer/baichuan.sh
@@ -0,0 +1 @@
+python projects/Baichuan/pipeline.py --model_path=/root/models/Baichuan2-7B-Chat --mode=huggingface --device=xpu
1 change: 1 addition & 0 deletions infer/chatglm.sh
@@ -0,0 +1 @@
+python projects/ChatGLM/pipeline.py --model_path=/root/models/chatglm2-6b --mode=huggingface --device=xpu
1 change: 1 addition & 0 deletions infer/llama.sh
@@ -0,0 +1 @@
+python projects/Llama/pipeline.py --model_path=/root/models/Llama-2-7b-chat-hf --mode=huggingface --device=xpu
1 change: 1 addition & 0 deletions infer/qwen.sh
@@ -0,0 +1 @@
+python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface --device=xpu
2 changes: 1 addition & 1 deletion libai/engine/default.py
@@ -306,7 +306,7 @@ def __init__(self, cfg):
dist.synchronize()
start_time = time.time()
logger.info("> Start building model...")
-self.model = self.build_model(cfg)
+self.model = self.build_model(cfg).half()

dist.synchronize()
logger.info(
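
The engine-level change above casts the freshly built model to half precision before training starts. A minimal sketch of the effect, assuming oneflow's nn.Module.half() mirrors the PyTorch API (parameters and buffers cast to float16 in place):

import oneflow as flow
import oneflow.nn as nn

# Sketch only, not part of this PR: the same cast pattern as build_model(cfg).half().
model = nn.Linear(4, 4)
print(model.weight.dtype)  # oneflow.float32
model = model.half()
print(model.weight.dtype)  # oneflow.float16
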
3 changes: 2 additions & 1 deletion libai/layers/embedding.py
@@ -161,7 +161,8 @@ def forward(self, input_ids):
# [B, S(0)] x [S(0), B] --> [S(0), P]
# ↑ ↑ ↑
# embed input_ids input_embeds
-input_embeds = flow._C.gather(weight, input_ids, axis=0)
+with flow.no_grad():
+    input_embeds = flow._C.gather(weight, input_ids, axis=0)
# Set the embeds sbp from [S(0), P] --> [S(0), B] to get complete embedding results.
input_embeds = input_embeds.to_global(sbp=dist.get_hidden_sbp())

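
For reference, a minimal local-tensor sketch of the pattern introduced above: ops executed under flow.no_grad() are excluded from the autograd graph, so the gathered embeddings carry no grad_fn. The real code operates on global tensors with SBP signatures; the sketch uses plain local tensors.

import oneflow as flow

weight = flow.randn(10, 4, requires_grad=True)
input_ids = flow.tensor([1, 3, 5])
with flow.no_grad():
    input_embeds = flow._C.gather(weight, input_ids, axis=0)
print(input_embeds.requires_grad)  # False
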
12 changes: 6 additions & 6 deletions projects/Aquila/configs/aquila_sft.py
@@ -19,7 +19,7 @@
# Hyperparameters
weight_decay = 0.1
learning_rate = 5e-5
-dataset_path = "./alpaca_data"
+dataset_path = "./data/aquila"
pretrained_model_path = "/root/models/Aquila-7B"

# graph & optim
@@ -63,10 +63,10 @@

train.update(
dict(
-output_dir="./sft_result",
-train_micro_batch_size=4,
+output_dir="./sft_result/aquila",
+train_micro_batch_size=1,
test_micro_batch_size=1,
-train_epoch=5,
+train_epoch=1,
train_iter=1,
log_period=1,
warmup_ratio=1 / 3,
@@ -75,7 +75,7 @@
train_with_fp16=True,
amp=dict(enabled=True),
activation_checkpoint=dict(enabled=True),
-input_placement_device="cuda",
+input_placement_device="xpu",
checkpointer=dict(
period=100,
max_to_keep=20,
@@ -85,7 +85,7 @@
tensor_parallel_size=1,
pipeline_parallel_size=1,
pipeline_num_layers=cfg.hidden_layers,
-device_type="cuda",
+device_type="xpu",
),
evaluation=dict(
enabled=False,
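
The Aquila SFT config now points every device-related field at xpu. A sketch of how those fields could be inspected or overridden at load time, assuming LiBai's LazyConfig API (LazyConfig.load and the attribute names below follow the config above and are not verified against this branch):

from libai.config import LazyConfig

cfg = LazyConfig.load("projects/Aquila/configs/aquila_sft.py")
print(cfg.train.dist.device_type)        # "xpu"
print(cfg.train.input_placement_device)  # "xpu"
cfg.train.train_micro_batch_size = 2     # override without editing the file
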
4 changes: 2 additions & 2 deletions projects/Aquila/utils/data_prepare.py
@@ -18,9 +18,9 @@


def prepare(
-destination_path: Path = Path("./alpaca_data"),
+destination_path: Path = Path("./data/aquila"),
checkpoint_dir: Path = Path("/root/models/Aquila-7B"),
-test_split_fraction: float = 0.03865, # to get exactly 2000 test samples,
+test_split_fraction: float = 0.60, # to get exactly 2000 test samples,
seed: int = 42,
mask_inputs: bool = False, # as in alpaca-lora
data_file_name: str = "alpaca_data_cleaned_archive.json",
2 changes: 1 addition & 1 deletion projects/Baichuan/configs/baichuan_config.py
@@ -58,5 +58,5 @@
tokenization = OmegaConf.create()
tokenization.make_vocab_size_divisible_by = 1
tokenization.tokenizer = LazyCall(BaichuanTokenizer)(
-# pretrained_model_path=cfg.pretrained_model_path + "/tokenizer.model"
+pretrained_model_path=cfg.pretrained_model_path + "/tokenizer.model"
)
18 changes: 9 additions & 9 deletions projects/Baichuan/configs/baichuan_sft.py
@@ -18,9 +18,9 @@

# Hyperparameters
weight_decay = 0.1
-learning_rate = 5e-5
-dataset_path = "./alpaca_data"
-pretrained_model_path = "/root/models/Llama-2-7b-chat-hf"
+learning_rate = 1e-5
+dataset_path = os.environ["DATA_DIR"]
+pretrained_model_path = os.environ["MODEL_DIR"]

# graph & optim
graph["enabled"] = False
@@ -61,19 +61,19 @@

train.update(
dict(
-output_dir="./sft_result",
+output_dir="./sft_result/baichuan",
train_micro_batch_size=1,
test_micro_batch_size=1,
-train_epoch=3,
+train_epoch=1,
train_iter=1,
log_period=1,
warmup_ratio=1 / 3,
num_accumulation_steps=8,
-rdma_enabled=True,
-amp=dict(enabled=False),
+rdma_enabled=False,
+amp=dict(enabled=True),
train_with_fp16=True,
activation_checkpoint=dict(enabled=True),
-input_placement_device="cuda",
+input_placement_device="xpu",
checkpointer=dict(
period=5000,
max_to_keep=20,
@@ -83,7 +83,7 @@
tensor_parallel_size=1,
pipeline_parallel_size=1,
pipeline_num_layers=cfg.hidden_layers,
-device_type="cuda",
+device_type="xpu",
),
evaluation=dict(
enabled=True,
2 changes: 1 addition & 1 deletion projects/Baichuan/pipeline.py
@@ -117,7 +117,7 @@ def main(config_file, model_path, mode, device):
)

text = [
"Give three tips for staying healthy.",
"Wikipedia is a free online",
]
output = pipeline(inputs=text)
if dist.is_main_process():
14 changes: 10 additions & 4 deletions projects/Baichuan/utils/data_prepare.py
@@ -18,9 +18,9 @@


def prepare(
-destination_path: Path = Path("./data/libai_xpu_alpaca"),
+destination_path: Path = Path("./data/baichuan"),
checkpoint_dir: Path = Path("/root/models/Baichuan2-7B-Chat"),
-test_split_fraction: float = 0.03865, # to get exactly 2000 test samples,
+test_split_fraction: float = 0.60, # to get exactly 2000 test samples,
seed: int = 42,
mask_inputs: bool = False, # as in alpaca-lora
data_file_name: str = "alpaca_data_cleaned_archive.json",
@@ -38,7 +38,7 @@ def prepare(
max_seq_length = config["max_position_embeddings"]

destination_path.mkdir(parents=True, exist_ok=True)
-data_file_path = destination_path / data_file_name
+data_file_path = Path(data_file_name)
logger.info("Loading data file...")
download_if_missing(data_file_path, data_file_url)
with open(data_file_path, "r", encoding="utf-8") as file:
@@ -118,7 +118,7 @@ def prepare_sample(example: dict, tokenizer, max_length: int) -> dict:

padding = max_length - example.shape[0]
if padding > 0:
-example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1))
+example = flow.cat((example.to_local(), flow.zeros(padding, dtype=flow.long) - 1))
elif padding < 0:
example = example[:max_length]
labels = copy.deepcopy(example)
@@ -129,10 +129,16 @@ def prepare_sample(example: dict, tokenizer, max_length: int) -> dict:
labels[~label_mask] = -1
example = example[:-1]
labels = labels[1:]
+if example_mask.is_global:
+    example_mask = example_mask.to_local()
example_mask = flow.where(
example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf"))
)
example_mask = example_mask[:-1]
+if example.is_global:
+    example = example.to_local()
+if labels.is_global:
+    labels = labels.to_local()
return {
"input_ids": example,
"labels": labels,
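
The padding and masking path above now moves global (SBP) tensors to their local shards before flow.cat and flow.where. A small hypothetical helper capturing the repeated guard (the PR inlines the checks instead of defining a helper):

import oneflow as flow

def ensure_local(t: flow.Tensor) -> flow.Tensor:
    # Global tensors expose .is_global and .to_local(); local tensors pass through unchanged.
    return t.to_local() if t.is_global else t
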
6 changes: 3 additions & 3 deletions projects/ChatGLM/chatglm.py
@@ -180,7 +180,7 @@ def scaled_dot_product_attention(
def forward(self, query_layer, key_layer, value_layer, attention_mask=None):
# query_layer: [sq, b, np, hn] -[premute]-> [batch_size, head_num, seq_len, hidden_size]
query_layer, key_layer, value_layer = [
-k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]
+k.transpose(1, 2).transpose(0, 2) for k in [query_layer, key_layer, value_layer]
]
if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
context_layer = self.scaled_dot_product_attention(
@@ -194,7 +194,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask=None):
query_layer, key_layer, value_layer, attention_mask
)

-context_layer = context_layer.permute(2, 0, 1, 3)
+context_layer = context_layer.transpose(0, 1).transpose(0, 2)
context_layer = context_layer.flatten(2)
return context_layer

@@ -709,7 +709,7 @@ def get_prompt(self, batch_size):
)
# seq_len, b, nh, hidden_size
past_key_values = self.dropout(past_key_values)
-past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
+past_key_values = past_key_values.transpose(0, 2).split(2)
return past_key_values

def forward(
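
The attention and prefix-cache changes above replace permute with chained transpose calls that yield the same dimension order. A quick equivalence check, as a standalone sketch:

import oneflow as flow

x = flow.randn(2, 3, 4, 5)
# [sq, b, np, hn] -> [b, np, sq, hn]
assert (x.permute(1, 2, 0, 3) == x.transpose(1, 2).transpose(0, 2)).all()
# [b, np, sq, hn] -> [sq, b, np, hn]
assert (x.permute(2, 0, 1, 3) == x.transpose(0, 1).transpose(0, 2)).all()

y = flow.randn(2, 3, 4, 5, 6)
assert (y.permute(2, 1, 0, 3, 4) == y.transpose(0, 2)).all()
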
16 changes: 9 additions & 7 deletions projects/ChatGLM/configs/chatglm_sft.py
@@ -17,15 +17,15 @@

# Hyperparameters
weight_decay = 0.1
-learning_rate = 2e-5
+learning_rate = 1e-7
max_source_len = 128
max_target_len = 128
max_length = 256
dataset_path = os.environ["DATA_DIR"]
pretrained_model_path = os.environ["CHATGLM_HF_DIR"]

# graph & optim
graph["enabled"] = True
graph["enabled"] = False

optim.update(
dict(
@@ -71,26 +71,28 @@

train.update(
dict(
-output_dir="./sft_result",
+output_dir="./sft_result/chatglm",
train_micro_batch_size=1,
test_micro_batch_size=1,
-train_epoch=3,
+train_epoch=1,
train_iter=1,
-log_period=10,
+log_period=1,
warmup_ratio=2 / 5,
num_accumulation_steps=8,
-rdma_enabled=True,
+rdma_enabled=False,
amp=dict(enabled=True),
activation_checkpoint=dict(enabled=True),
+input_placement_device="xpu",
checkpointer=dict(
period=5000,
max_to_keep=1,
),
dist=dict(
data_parallel_size=1,
tensor_parallel_size=1,
-pipeline_parallel_size=4,
+pipeline_parallel_size=1,
pipeline_num_layers=cfg.num_layers,
+device_type="xpu",
),
evaluation=dict(
enabled=False,
20 changes: 0 additions & 20 deletions projects/ChatGLM/dataset.py
@@ -21,10 +21,8 @@

from libai.data.structures import DistTensorData, Instance
from libai.utils import distributed as dist
-from libai.utils.logger import setup_logger

IGNORE_INDEX = -100
-logger = setup_logger()


class ChatGLMTrainDataset(Dataset):
@@ -40,7 +38,6 @@ def __init__(self, path, tokenizer, max_source_len=128, max_target_len=128, max_
self.max_len = max_length

example = self._preprocess(0)
-self.log_dataset_example(example)

def _preprocess(self, idx):
# inputs with format `<bos> X Y <eos>` labels with format `<ignore> ... <ignore> Y <eos>`
@@ -71,23 +68,6 @@ def _preprocess(self, idx):

return {"input_ids": input_ids, "labels": labels}

-def log_dataset_example(self, example: Dict[str, List[int]]) -> None:
-    if dist.is_main_process():
-        logger.info("input_ids:\n{}".format(example["input_ids"]))
-        logger.info(
-            "inputs:\n{}".format(
-                self.tokenizer.decode(example["input_ids"], skip_special_tokens=False)
-            )
-        )
-        logger.info("label_ids:\n{}".format(example["labels"]))
-        logger.info(
-            "labels:\n{}".format(
-                self.tokenizer.decode(
-                    list(filter(lambda x: x != IGNORE_INDEX, example["labels"])),
-                    skip_special_tokens=False,
-                )
-            )
-        )

def __len__(self):
return len(self.data["prompt"])
1 change: 0 additions & 1 deletion projects/ChatGLM/pipeline.py
@@ -188,7 +188,6 @@ def main(config_file, model_path, mode, device):
"a dog is flying on the sky",
"Wikipedia is a free online",
"what is beam search?",
"what is beam search?",
]
pipeline = TextGenerationPipeline(
config_file,
2 changes: 1 addition & 1 deletion projects/ChatGLM/utils/prepare_data_alpaca.py
@@ -16,7 +16,7 @@

def prepare(
destination_path: Path = Path(os.environ["DATA_DIR"]),
-test_split_fraction: float = 0.03865, # to get exactly 2000 test samples,
+test_split_fraction: float = 0.60, # to get exactly 2000 test samples,
seed: int = 42,
data_file_name: str = "alpaca_data_cleaned_archive.json",
data_file_url: str = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json", # noqa
4 changes: 2 additions & 2 deletions projects/Llama/configs/llama_config.py
@@ -48,7 +48,7 @@
eos_token_id=2,
pad_token_id=0,
# train
pretrained_model_path="meta-llama/Llama-2-7b-hf",
pretrained_model_path="/root/models/Llama-2-7b-chat-hf",
)

cfg = DictConfig(cfg)
@@ -57,5 +57,5 @@
tokenization = OmegaConf.create()
tokenization.make_vocab_size_divisible_by = 1
tokenization.tokenizer = LazyCall(LlamaTokenizer)(
# pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model"
pretrained_model_path="/root/models/Llama-2-7b-chat-hf/tokenizer.model"
)
16 changes: 9 additions & 7 deletions projects/Llama/configs/llama_sft.py
@@ -19,8 +19,8 @@
# Hyperparameters
weight_decay = 0.1
learning_rate = 5e-5
dataset_path = "alpaca_data"
pretrained_model_path = "meta-llama/Llama-2-7b-hf"
dataset_path = "./data/llama"
pretrained_model_path = "/root/models/Llama-2-7b-chat-hf"

# graph & optim
graph["enabled"] = False
@@ -61,26 +61,28 @@

train.update(
dict(
-output_dir="./sft_result",
-train_micro_batch_size=4,
+output_dir="./sft_result/llama",
+train_micro_batch_size=1,
test_micro_batch_size=1,
-train_epoch=3,
+train_epoch=1,
train_iter=1,
-log_period=10,
+log_period=1,
warmup_ratio=1 / 3,
num_accumulation_steps=8,
rdma_enabled=False,
amp=dict(enabled=True),
activation_checkpoint=dict(enabled=True),
+input_placement_device="xpu",
checkpointer=dict(
period=5000,
max_to_keep=20,
),
dist=dict(
data_parallel_size=1,
tensor_parallel_size=1,
-pipeline_parallel_size=8,
+pipeline_parallel_size=1,
pipeline_num_layers=cfg.hidden_layers,
+device_type="xpu",
),
evaluation=dict(
enabled=True,