Improve metric calculation and logging

fynnsu · fynnsu · commit 2cd5b7a8b417 · 2025-10-28T21:21:02.000Z
Signed-off-by: Fynn Schmitt-Ulms <fschmitt@redhat.com>
diff --git a/src/speculators/train/eagle3/core.py b/src/speculators/train/eagle3/core.py
@@ -20,6 +20,7 @@ def align_for_step(
     logits: torch.Tensor,  # shape: [1, total_seq_len, draft_vocab_size]
     targets: torch.Tensor,  # shape: [1, total_seq_len, draft_vocab_size]
     loss_mask: torch.Tensor | None,  # shape: [1, total_seq_len]
+    prev_correct: torch.Tensor | None,  # shape: [1, total_seq_len]
     ttt_step: int,
 ):
     # There are no target values for the last ttt_step tokens, so we mask them out
@@ -40,24 +41,38 @@ def align_for_step(
     if loss_mask is not None:
         loss_mask = loss_mask[:, ttt_step:]
         # shape: [1, total_seq_len - ttt_step]
-    return logits, targets, loss_mask
+    if prev_correct is not None:
+        # Align with draft starts
+        prev_correct = prev_correct[:, :-ttt_step] if ttt_step > 0 else prev_correct
+        # shape: [1, total_seq_len - ttt_step]
+    return logits, targets, loss_mask, prev_correct
 
 
 @torch.no_grad()
 def compute_accuracy(
     logits: torch.Tensor,  # shape: [1, total_seq_len - ttt_step, draft_vocab_size]
     targets: torch.Tensor,  # shape: [1, total_seq_len - ttt_step, draft_vocab_size]
     loss_mask: torch.Tensor | None,  # shape: [1, total_seq_len - ttt_step]
+    prev_correct: torch.Tensor | None,  # shape: [1, total_seq_len - ttt_step]
 ):
     # Note: logits, targets, and loss_mask are already aligned for the current ttt_step
     target_tokens = torch.argmax(targets, dim=-1)
     predicted_tokens = torch.argmax(logits, dim=-1)
     # shape: [1, total_seq_len - ttt_step]
 
     correct = predicted_tokens == target_tokens
+    cond_denom: torch.Tensor | int = correct.numel()
+    if prev_correct is not None:
+        cond_denom = prev_correct.sum()
+        # Update prev_correct in place
+        correct = torch.logical_and(prev_correct, correct, out=prev_correct)
     if loss_mask is not None:
         correct = torch.masked_select(correct, loss_mask.to(torch.bool))
-    return correct.float().sum() / (correct.numel() + 1e-5)
+
+    correct_sum = correct.float().sum()
+    full_denom = correct.numel()
+
+    return correct_sum / (full_denom + 1e-5), correct_sum / (cond_denom + 1e-5)
 
 
 def loss_function(
@@ -235,8 +250,13 @@ def forward(
                 # shape: [1, total_seq_len, draft_vocab_size]
 
         loss = torch.tensor(0.0, device=device)
+        prev_correct = (
+            loss_mask.clone()
+            if loss_mask is not None
+            else torch.ones(1, total_seq_len, device=device, dtype=torch.bool)
+        )
         draft_tokens = []
-        accuracy_list = []
+        metrics = {}
         for ttt_step in range(ttt_steps):
             with torch.no_grad():
                 input_embeds = self.embed_tokens(input_ids)
@@ -269,12 +289,19 @@ def forward(
             # shape: [1, total_seq_len, draft_vocab_size]
 
             if return_loss:
-                s_logits, s_targets, s_loss_mask = align_for_step(
-                    logits, target_logits, loss_mask, ttt_step
+                s_logits, s_targets, s_loss_mask, s_prev_correct = align_for_step(
+                    logits, target_logits, loss_mask, prev_correct, ttt_step
                 )
                 loss_weight = self.ttt_step_loss_decay**ttt_step
-                loss += loss_weight * loss_function(s_logits, s_targets, s_loss_mask)
-                accuracy_list.append(compute_accuracy(s_logits, s_targets, s_loss_mask))
+                s_loss = loss_weight * loss_function(s_logits, s_targets, s_loss_mask)
+                loss += s_loss
+
+                s_full_acc, s_cond_acc = compute_accuracy(
+                    s_logits, s_targets, s_loss_mask, s_prev_correct
+                )
+                metrics[f"loss_{ttt_step}"] = s_loss.detach().clone()
+                metrics[f"full_acc_{ttt_step}"] = s_full_acc
+                metrics[f"cond_acc_{ttt_step}"] = s_cond_acc
 
             input_ids = torch.argmax(logits, dim=-1)
             draft_tokens.append(input_ids.detach().clone())
@@ -303,11 +330,9 @@ def forward(
             position_ids = position_ids + 1
             # shape: [1, total_seq_len]
 
+        metrics["loss"] = loss.detach().clone()
+
         if return_loss:
-            return (
-                draft_tokens,
-                loss,
-                torch.tensor(accuracy_list, device=device, dtype=torch.float),
-            )
+            return draft_tokens, loss, metrics
         else:
             return draft_tokens
diff --git a/src/speculators/train/trainer.py b/src/speculators/train/trainer.py
@@ -118,7 +118,7 @@ def train_epoch(self, epoch: int):
                 for k, v in batch.items()
             }
 
-            _draft_tokens, loss, draft_accuracies = self.model(
+            _draft_tokens, loss, metrics = self.model(
                 **gpu_batch, **self.config.train_call_kwargs
             )
 
@@ -127,18 +127,13 @@ def train_epoch(self, epoch: int):
             torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
             self.opt.step()
 
-            loss = loss.detach().clone()
             if self.is_distributed:
-                # Note: this is not needed for training, just for logging
-                dist.reduce(loss, dst=0, op=dist.ReduceOp.AVG)
-                dist.reduce(draft_accuracies, dst=0, op=dist.ReduceOp.AVG)
+                for v in metrics.values():
+                    dist.reduce(v, dst=0, op=dist.ReduceOp.AVG)
 
-            acc_values = {
-                f"acc_{i}": acc.item() for i, acc in enumerate(draft_accuracies)
-            }
+            metrics = {k: v.item() for k, v in metrics.items()}
             metric_logger.info(
-                {"train": {"loss": loss.item(), **acc_values}, "epoch": epoch},
-                extra={"step": self.global_step},
+                {"train": metrics, "epoch": epoch}, extra={"step": self.global_step}
             )
             self.global_step += 1
 
@@ -152,36 +147,29 @@ def val_epoch(self, epoch: int):
         val_loader = self.val_loader
         if self.local_rank == 0:
             val_loader = tqdm(val_loader, desc=f"Epoch {epoch}")  # type: ignore[assignment]
-        val_loss = torch.zeros(1, device=self.local_rank)
-        val_accuracies = torch.zeros(
-            (), device=self.local_rank
-        )  # initialize to tensor of shape ()
+
+        val_metrics: dict[str, float] = {}
+        num_batches = len(val_loader)
         for batch in val_loader:
             gpu_batch = {
                 k: v.to(self.local_rank) if isinstance(v, torch.Tensor) else v
                 for k, v in batch.items()
             }
 
-            _draft_tokens, loss, draft_accuracies = self.model(
+            _draft_tokens, _loss, metrics = self.model(
                 **gpu_batch, **self.config.val_call_kwargs
             )
 
             if self.is_distributed:
-                dist.reduce(val_loss, dst=0, op=dist.ReduceOp.AVG)
-                dist.reduce(draft_accuracies, dst=0, op=dist.ReduceOp.AVG)
-
-            val_loss += loss.detach().clone()
-            # Can't use += here because val_accuracies has shape () on first iteration
-            val_accuracies = val_accuracies + draft_accuracies.detach()
-
-        val_loss /= len(val_loader)
-        val_accuracies /= len(val_loader)
-        acc_values = {
-            f"acc_{i}_epoch": acc.item() for i, acc in enumerate(val_accuracies)
-        }
+                for v in metrics.values():
+                    dist.reduce(v, dst=0, op=dist.ReduceOp.AVG)
+
+            for k, v in metrics.items():
+                val_metrics[k] = val_metrics.get(k, 0.0) + v.item()
+
+        val_metrics = {f"{k}_epoch": v / num_batches for k, v in val_metrics.items()}
         metric_logger.info(
-            {"val": {"loss_epoch": val_loss.item(), **acc_values}, "epoch": epoch},
-            extra={"step": self.global_step},
+            {"val": val_metrics, "epoch": epoch}, extra={"step": self.global_step}
         )
 
     def save_checkpoint(self, epoch: int):