Skip to content

convert_examples_to_features_bos_eos #5

Description

@deegy666

这段 `if len(tokens) + len(word_tokens) > max_seq_length - 1:` 的逻辑是不是有点问题?在下面这段代码里,`self.features.append(...)` 只在这个条件成立时才会执行,所以我只输入一条(较短的)数据时,它是不会被 convert 的?
print(f"Converting examples to features... bos_id:{tokenizer.piece_to_id('<s>')}, eos_id:{tokenizer.piece_to_id('</s>')}")
for il, line in enumerate(tqdm(self.text_lines)):
words = line.split()

		tokens_num = self.getTokensNum(words, tokenizer)
		if tokens_num < max_seq_length - 10:
			for iw, word in enumerate(words):
				word_tokens = tokenizer.encode(word, out_type=int)

				if len(tokens) + len(word_tokens) > max_seq_length - 1:

					tokens.append(tokenizer.piece_to_id('</s>'))
					labels[0].append(0)
					labels[1].append(0)
					valid.append(1)

					token_masks = [1] * len(tokens)
					label_masks = [1] * len(labels[0])
					label_len = len(labels[0])

					while len(tokens) < max_seq_length:
						tokens.append(0)
						token_masks.append(0)
						valid.append(0)
					while len(labels[0]) < max_seq_length:
						labels[0].append(0)
						labels[1].append(0)
						label_masks.append(0)
					assert len(tokens) == max_seq_length
					assert len(token_masks) == max_seq_length
					assert len(valid) == max_seq_length
					assert len(labels[0]) == max_seq_length
					assert len(labels[1]) == max_seq_length
					assert len(label_masks) == max_seq_length

					self.features.append( InputFeatures(token_ids = tokens,
												   label_ids = labels,
												   valid_ids = valid,
												   token_masks = token_masks,
												   label_masks = label_masks,
												   label_len = label_len) )

					tokens = [tokenizer.piece_to_id('<s>')]
					labels = [[0], [0]]
					valid = [1]
					token_masks = []
					label_masks = []
					label_len = 0

					### iw = 0 means this word is the start of a new sentence, no need to insert last part sentence
					if iw > 0:
						tokens.extend(self.last_part_sentence.tokens)
						labels[0].extend(self.last_part_sentence.labels[0])
						labels[1].extend(self.last_part_sentence.labels[1])
						valid.extend(self.last_part_sentence.valid)
				
				if iw == 0:
					self.last_part_sentence.tokens = []
					self.last_part_sentence.labels = [[], []]
					self.last_part_sentence.valid = []

				tokens.extend(word_tokens)
				self.last_part_sentence.tokens.extend(word_tokens)
				for m in range(len(word_tokens)):
					if m == 0:
						labels[0].append(self.label_lines[0][il][iw])
						labels[1].append(self.label_lines[1][il][iw])
						valid.append(1)

						self.last_part_sentence.labels[0].append(self.label_lines[0][il][iw])
						self.last_part_sentence.labels[1].append(self.label_lines[1][il][iw])
						self.last_part_sentence.valid.append(1)
					else:
						valid.append(0)

						self.last_part_sentence.valid.append(0)
		else:
			print(f"tokens num:[{tokens_num}] ----> {line}")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions