
Commit aa501d9

Add support for GLM variant models, including both LLaMA and GPT-2 style tokenizers.
1 parent 164e34e

1 file changed: +19 -7 lines

convert_hf_to_gguf.py
@@ -5054,16 +5054,28 @@ def set_vocab(self):
             toktypes.append(gguf.TokenType.NORMAL)
             tokens.append(token)

-        self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(
-            self.dir_model,
-            load_merges=False,
-            n_vocab=vocab_size
-        )
+        try:
+            # for https://huggingface.co/THUDM/glm-4-9b
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model,
+                load_merges=True,
+                n_vocab=vocab_size
+            )
+
+            self.gguf_writer.add_tokenizer_model("gpt2")
+        except Exception as e:
+            logger.warning(f'Failed to load special tokens: {e}')
+            # for https://huggingface.co/THUDM/glm-4-9b-hf
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model,
+                load_merges=False,
+                n_vocab=vocab_size
+            )
+            self.gguf_writer.add_tokenizer_model("llama")
+
         # only add special tokens when they were not already loaded from config.json

         #TODO In llama.cpp, special tokens are mapped one-to-one between a token and a coordinate. However, in reality, a transformer might associate a special token like eos_token_id with multiple tokens.
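For readability, here is the same logic after the change, rendered as plain Python instead of a diff (a sketch: tokens, toktypes, tokpre, vocab_size, self.gguf_writer, self.dir_model, and logger are all assumed from the surrounding set_vocab() in convert_hf_to_gguf.py, not defined here):

# Post-commit tokenizer selection (sketch; set_vocab() context assumed).
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)

try:
    # GPT-2 style BPE vocabulary with merges,
    # e.g. https://huggingface.co/THUDM/glm-4-9b
    special_vocab = gguf.SpecialVocab(
        self.dir_model,
        load_merges=True,
        n_vocab=vocab_size
    )
    self.gguf_writer.add_tokenizer_model("gpt2")
except Exception as e:
    logger.warning(f'Failed to load special tokens: {e}')
    # LLaMA style tokenizer without merges,
    # e.g. https://huggingface.co/THUDM/glm-4-9b-hf
    special_vocab = gguf.SpecialVocab(
        self.dir_model,
        load_merges=False,
        n_vocab=vocab_size
    )
    self.gguf_writer.add_tokenizer_model("llama")

As the diff reads, the try/except doubles as format detection: if SpecialVocab can be constructed with load_merges=True, the vocabulary is written as a GPT-2 style tokenizer; any failure falls through to the LLaMA-style path used by the -hf variant.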
