
Commit

black format
ShawnXuan committed Sep 19, 2024
1 parent 1362ca1 commit acb337a
Showing 3 changed files with 5 additions and 6 deletions.
4 changes: 2 additions & 2 deletions projects/Qwen/configs/qwen_sft.py
@@ -33,8 +33,8 @@
 tokenization = OmegaConf.create()
 tokenization.make_vocab_size_divisible_by = 1
 tokenization.tokenizer = LazyCall(Qwen2Tokenizer)(
-    vocab_file=pretrained_model_path+"/vocab.json",
-    merges_file=pretrained_model_path+"/merges.txt",
+    vocab_file=pretrained_model_path + "/vocab.json",
+    merges_file=pretrained_model_path + "/merges.txt",
 )


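The hunk above is black's standard spacing rule for binary operators: the + operator gets a space on each side, while the = in a keyword argument stays tight. A minimal sketch of reproducing the transformation with black's Python API (assumes black is installed via pip install black; f and path are placeholder names, not code from this repo):

import black

# Placeholder call mirroring the keyword arguments in qwen_sft.py.
src = 'f(vocab_file=path+"/vocab.json", merges_file=path+"/merges.txt")\n'
print(black.format_str(src, mode=black.Mode()), end="")
# Output: f(vocab_file=path + "/vocab.json", merges_file=path + "/merges.txt")
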
1 change: 1 addition & 0 deletions projects/Qwen/pipeline.py
@@ -74,6 +74,7 @@ def _parse_parameters(self, **pipeline_parameters):
     def preprocess(self, inputs, **kwargs) -> dict:
         # tokenizer encoder
         import oneflow as flow
+
         inputs = flow.tensor(self.tokenizer.encode(inputs, add_bos=True, padding=True))

         inputs = {
6 changes: 2 additions & 4 deletions projects/Qwen/utils/prepare_alpaca.py
@@ -111,12 +111,10 @@ def prepare_sample(example: dict, tokenizer, max_length: int) -> dict:
"""
full_prompt = generate_prompt(example)
full_prompt_and_response = full_prompt + example["output"]

prompt = tokenizer.encode(full_prompt, device="cpu")
prompt = flow.tensor(prompt, dtype=flow.int, device="cpu")
example = tokenizer.encode(
full_prompt_and_response, device="cpu"
)
example = tokenizer.encode(full_prompt_and_response, device="cpu")
example = flow.tensor(example, dtype=flow.int, device="cpu")

padding = max_length - example.shape[0]
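The last hunk is black joining a call that fits within its default 88-character line limit; because the original call had no trailing comma after device="cpu", black is free to collapse it onto one line. A hedged sketch of the same transformation (assumes black is installed; the snippet is standalone, and tokenizer is only parsed, never executed):

import black

# Multi-line call as it appeared before this commit.
src = 'example = tokenizer.encode(\n    full_prompt_and_response, device="cpu"\n)\n'
print(black.format_str(src, mode=black.Mode()), end="")
# Output: example = tokenizer.encode(full_prompt_and_response, device="cpu")

Had the original call ended with a "magic trailing comma" (device="cpu",), black would instead have kept it exploded across multiple lines.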
