Skip to content

Commit 68c22af

Browse files
committed
fixed tokenization
1 parent de2a558 commit 68c22af

File tree

2 files changed

+8
-13
lines changed

2 files changed

+8
-13
lines changed

eval_llm/WIKITEXT2/run_wikitext-2_benchmark.py

+3-9
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,15 @@
1010

1111
def tokenize_dataset(opt, context_length):
1212
print("Tokenization...")
13-
1413
# Clean and Concat the dataset
1514
x = open(opt.src, "r").readlines()
1615
xx = [_x for _x in x if _x != " \n"]
17-
print(xx[:2])
1816
from onmt.transforms.tokenize import SentencePieceTransform
1917

2018
tokenizer = SentencePieceTransform(opt)
2119
tokenizer.warm_up()
2220
tokens = tokenizer._tokenize(xx)
2321
print("Done !")
24-
print(len(tokens))
25-
print(tokens[:100])
2622
return tokens
2723

2824

@@ -46,23 +42,21 @@ def evaluate(opt):
4642

4743
# Score the dataset.
4844
stride = 512
49-
max_seq_length = 4096
5045
max_seq_length = 2048
46+
engine_opt.batch_type = "sents"
47+
engine_opt.batch_size = 1
5148
seq_len = len(tokens)
52-
print("seq_len: ", seq_len)
5349
score_results = []
5450
nlls = []
5551
src = []
5652
for begin_loc in range(0, seq_len, stride):
57-
end_loc = min(begin_loc + max_seq_length - 1, seq_len)
53+
end_loc = min(begin_loc + max_seq_length, seq_len)
5854
src.append(" ".join(tokens[begin_loc:end_loc]))
59-
6055
start_time = time.time()
6156
score_results = engine.score_list(src=src)
6257
nlls = [_score for (_score, _length) in score_results]
6358
lengths = [_length for (_score, _length) in score_results]
6459
ppl = np.exp(-np.sum(nlls) / np.sum(lengths))
65-
print(ppl)
6660
engine.terminate()
6761
end_time = time.time()
6862
logger.info("total run time %.2f" % (end_time - start_time))

onmt/translate/translator.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ def _score(self, infer_iter):
584584
self.with_scores = True
585585
scored_bucket = {}
586586
for batch, bucket_idx in infer_iter:
587-
batch_data = self.translate_batch(batch, attn_debug=False)
587+
batch_data = self.translate_batch(batch, attn_debug=False, scoring=True)
588588
batch_gold_scores = batch_data["gold_score"].cpu().numpy().tolist()
589589
batch_tgt_lengths = batch["tgtlen"].cpu().numpy().tolist()
590590
batch_inds_in_bucket = batch["ind_in_bucket"]
@@ -1001,8 +1001,9 @@ def _align_forward(self, batch, predictions):
10011001
"""
10021002
raise NotImplementedError
10031003

1004-
def translate_batch(self, batch, attn_debug):
1004+
def translate_batch(self, batch, attn_debug, scoring=False):
10051005
"""Translate a batch of sentences."""
1006+
max_length = 0 if scoring else self.max_length
10061007
with torch.no_grad():
10071008
if self.sample_from_topk != 0 or self.sample_from_topp != 0:
10081009
decode_strategy = GreedySearchLM(
@@ -1015,7 +1016,7 @@ def translate_batch(self, batch, attn_debug):
10151016
batch_size=len(batch["srclen"]),
10161017
global_scorer=self.global_scorer,
10171018
min_length=self.min_length,
1018-
max_length=self.max_length,
1019+
max_length=max_length,
10191020
block_ngram_repeat=self.block_ngram_repeat,
10201021
exclusion_tokens=self._exclusion_idxs,
10211022
return_attention=attn_debug or self.replace_unk,
@@ -1039,7 +1040,7 @@ def translate_batch(self, batch, attn_debug):
10391040
n_best=self.n_best,
10401041
global_scorer=self.global_scorer,
10411042
min_length=self.min_length,
1042-
max_length=self.max_length,
1043+
max_length=max_length,
10431044
return_attention=attn_debug or self.replace_unk,
10441045
block_ngram_repeat=self.block_ngram_repeat,
10451046
exclusion_tokens=self._exclusion_idxs,

0 commit comments

Comments (0)