这段 `if len(tokens) + len(word_tokens) > max_seq_length - 1:` 的逻辑是不是有点问题？我只输入一条数据时，它是不会 convert 的。
# Pack the words of every line in `self.text_lines` into fixed-length
# InputFeatures chunks of exactly `max_seq_length` positions.
#
# State carried in from before this fragment (not visible here): `tokens`,
# `labels` (a pair of parallel lists), `valid` and `self.last_part_sentence`.
# Presumably they start out like the reset performed after each flush
# below -- TODO confirm against the enclosing method.
#
# NOTE(review): a chunk is appended to `self.features` only when the next
# word would overflow the current buffer. Input that never fills a buffer
# (e.g. one short line) therefore emits no feature inside this loop. If
# nothing after the loop flushes the remaining `tokens`/`labels`/`valid`,
# that trailing partial chunk is dropped -- verify, and if so flush it once
# the loop has finished (append eos, pad, append InputFeatures).
bos_id = tokenizer.piece_to_id('<s>')
eos_id = tokenizer.piece_to_id('</s>')
print(f"Converting examples to features... bos_id:{bos_id}, eos_id:{eos_id}")
for il, line in enumerate(tqdm(self.text_lines)):
    words = line.split()
    tokens_num = self.getTokensNum(words, tokenizer)
    if tokens_num < max_seq_length - 10:
        for iw, word in enumerate(words):
            word_tokens = tokenizer.encode(word, out_type=int)

            # "- 1" keeps one position free for the eos id appended on flush.
            if len(tokens) + len(word_tokens) > max_seq_length - 1:
                # --- Flush: close the current buffer and emit it. ---
                tokens.append(eos_id)
                labels[0].append(0)
                labels[1].append(0)
                valid.append(1)

                # Masks are 1 over real content, 0 over the padding added next.
                token_masks = [1] * len(tokens)
                label_masks = [1] * len(labels[0])
                label_len = len(labels[0])

                # Right-pad with zeros up to max_seq_length. A non-positive
                # pad count yields an empty list, i.e. no padding.
                token_pad = [0] * (max_seq_length - len(tokens))
                tokens.extend(token_pad)
                token_masks.extend(token_pad)
                valid.extend(token_pad)

                label_pad = [0] * (max_seq_length - len(labels[0]))
                labels[0].extend(label_pad)
                labels[1].extend(label_pad)
                label_masks.extend(label_pad)

                assert len(tokens) == max_seq_length
                assert len(token_masks) == max_seq_length
                assert len(valid) == max_seq_length
                assert len(labels[0]) == max_seq_length
                assert len(labels[1]) == max_seq_length
                assert len(label_masks) == max_seq_length

                self.features.append(
                    InputFeatures(token_ids=tokens,
                                  label_ids=labels,
                                  valid_ids=valid,
                                  token_masks=token_masks,
                                  label_masks=label_masks,
                                  label_len=label_len)
                )

                # --- Start a fresh buffer holding only the bos position. ---
                tokens = [bos_id]
                labels = [[0], [0]]
                valid = [1]
                token_masks = []
                label_masks = []
                label_len = 0

                # iw == 0 means this word starts a new line, so there is no
                # partial line to carry over. Otherwise re-insert the words
                # of the current line seen so far (they were also part of
                # the chunk just emitted), so the new chunk starts at the
                # beginning of the line.
                if iw > 0:
                    tokens.extend(self.last_part_sentence.tokens)
                    labels[0].extend(self.last_part_sentence.labels[0])
                    labels[1].extend(self.last_part_sentence.labels[1])
                    valid.extend(self.last_part_sentence.valid)

            # A new line begins: forget the previous line's carried-over part.
            if iw == 0:
                self.last_part_sentence.tokens = []
                self.last_part_sentence.labels = [[], []]
                self.last_part_sentence.valid = []

            tokens.extend(word_tokens)
            self.last_part_sentence.tokens.extend(word_tokens)

            # Only the first sub-token of a word receives the word's two
            # labels and valid=1; every further sub-token gets valid=0 and
            # no label entry. A word that encodes to no tokens adds nothing.
            sub_token_count = len(word_tokens)
            if sub_token_count:
                first_label = self.label_lines[0][il][iw]
                second_label = self.label_lines[1][il][iw]
                valid_flags = [1] + [0] * (sub_token_count - 1)

                labels[0].append(first_label)
                labels[1].append(second_label)
                valid.extend(valid_flags)

                self.last_part_sentence.labels[0].append(first_label)
                self.last_part_sentence.labels[1].append(second_label)
                self.last_part_sentence.valid.extend(valid_flags)
    else:
        # The line failed the length check above: log it and skip it.
        print(f"tokens num:[{tokens_num}] ----> {line}")
这段 `if len(tokens) + len(word_tokens) > max_seq_length - 1:` 的逻辑是不是有点问题？我只输入一条数据时，它是不会 convert 的。
# Log the ids of the sequence-start / sequence-end pieces, then walk the
# input lines. (The loop body continues beyond this excerpt.)
print(f"Converting examples to features... bos_id:{tokenizer.piece_to_id('<s>')}, eos_id:{tokenizer.piece_to_id('</s>')}")
for il, line in enumerate(tqdm(self.text_lines)):
    # Split the line into words on whitespace.
    words = line.split()