convert_examples_to_features_bos_eos

这段 `if len(tokens) + len(word_tokens) > max_seq_length - 1:` 的逻辑是不是有点问题？如果我只输入一条数据，它是不是就不会被 convert？
print(f"Converting examples to features... bos_id:{tokenizer.piece_to_id('<s>')}, eos_id:{tokenizer.piece_to_id('</s>')}")
		for il, line in enumerate(tqdm(self.text_lines)):
			words = line.split()

			tokens_num = self.getTokensNum(words, tokenizer)
			if tokens_num < max_seq_length - 10:
				for iw, word in enumerate(words):
					word_tokens = tokenizer.encode(word, out_type=int)

					if len(tokens) + len(word_tokens) > max_seq_length - 1:

						tokens.append(tokenizer.piece_to_id('</s>'))
						labels[0].append(0)
						labels[1].append(0)
						valid.append(1)

						token_masks = [1] * len(tokens)
						label_masks = [1] * len(labels[0])
						label_len = len(labels[0])

						while len(tokens) < max_seq_length:
							tokens.append(0)
							token_masks.append(0)
							valid.append(0)
						while len(labels[0]) < max_seq_length:
							labels[0].append(0)
							labels[1].append(0)
							label_masks.append(0)
						assert len(tokens) == max_seq_length
						assert len(token_masks) == max_seq_length
						assert len(valid) == max_seq_length
						assert len(labels[0]) == max_seq_length
						assert len(labels[1]) == max_seq_length
						assert len(label_masks) == max_seq_length

						self.features.append( InputFeatures(token_ids = tokens,
													   label_ids = labels,
													   valid_ids = valid,
													   token_masks = token_masks,
													   label_masks = label_masks,
													   label_len = label_len) )

						tokens = [tokenizer.piece_to_id('<s>')]
						labels = [[0], [0]]
						valid = [1]
						token_masks = []
						label_masks = []
						label_len = 0

						### iw = 0 means this word is the start of a new sentence, no need to insert last part sentence
						if iw > 0:
							tokens.extend(self.last_part_sentence.tokens)
							labels[0].extend(self.last_part_sentence.labels[0])
							labels[1].extend(self.last_part_sentence.labels[1])
							valid.extend(self.last_part_sentence.valid)
					
					if iw == 0:
						self.last_part_sentence.tokens = []
						self.last_part_sentence.labels = [[], []]
						self.last_part_sentence.valid = []

					tokens.extend(word_tokens)
					self.last_part_sentence.tokens.extend(word_tokens)
					for m in range(len(word_tokens)):
						if m == 0:
							labels[0].append(self.label_lines[0][il][iw])
							labels[1].append(self.label_lines[1][il][iw])
							valid.append(1)

							self.last_part_sentence.labels[0].append(self.label_lines[0][il][iw])
							self.last_part_sentence.labels[1].append(self.label_lines[1][il][iw])
							self.last_part_sentence.valid.append(1)
						else:
							valid.append(0)

							self.last_part_sentence.valid.append(0)
			else:
				print(f"tokens num:[{tokens_num}] ----> {line}")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

convert_examples_to_features_bos_eos #5

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

convert_examples_to_features_bos_eos #5

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions