Fix bucket refilling in _score method of Inference class (#2557)

l-k-11235 · web-flow · commit 43c3300cc162 · 2024-01-30T15:26:04.000+01:00
* fixed score results overriding
* fixed bucket refilling in translator._score
diff --git a/onmt/translate/translator.py b/onmt/translate/translator.py
@@ -576,29 +576,35 @@ def _process_bucket(bucket_translations):
 
     def _score(self, infer_iter):
         self.with_scores = True
-        scored_bucket = {}
+        score_res = []
+        processed_bucket = {}
+        prev_bucket_idx = 0
         for batch, bucket_idx in infer_iter:
+            if bucket_idx != prev_bucket_idx:
+                prev_bucket_idx += 1
+                score_res += [item for _, item in sorted(processed_bucket.items())]
+                processed_bucket = {}
             batch_data = self.translate_batch(batch, attn_debug=False, scoring=True)
             batch_gold_scores = batch_data["gold_score"].cpu().numpy().tolist()
+            batch_tgt_lengths = batch["tgtlen"].cpu().numpy().tolist()
+            batch_inds_in_bucket = batch["ind_in_bucket"]
             if self.return_gold_log_probs:
                 batch_gold_log_probs = (
                     batch_data["gold_log_probs"].cpu().numpy().tolist()
                 )
             else:
-                batch_gold_log_probs = None
-            batch_tgt_lengths = batch["tgtlen"].cpu().numpy().tolist()
-            batch_inds_in_bucket = batch["ind_in_bucket"]
-            for i, _score in enumerate(batch_gold_scores):
-                log_probs = (
-                    batch_gold_log_probs[i] if self.return_gold_log_probs else None
-                )
-                scored_bucket[batch_inds_in_bucket[i]] = (
-                    _score,
-                    log_probs,
+                batch_gold_log_probs = [
+                    None for i, _ in enumerate(batch_inds_in_bucket)
+                ]
+            for i, ind in enumerate(batch_inds_in_bucket):
+                processed_bucket[ind] = [
+                    batch_gold_scores[i],
+                    batch_gold_log_probs[i],
                     batch_tgt_lengths[i],
-                )
-        score_results = [scored_bucket[i] for i in range(len(scored_bucket))]
-        return score_results
+                ]
+        if processed_bucket:
+            score_res += [item for _, item in sorted(processed_bucket.items())]
+        return score_res
 
     def _align_pad_prediction(self, predictions, bos, pad):
         """