@@ -87,35 +87,38 @@ def model_fn(
     logits = self._model.apply({'params': params}, inputs)
     return logits, None
 
-  def compute_weighted_cross_entropy(
+  def loss_fn(
       self,
-      logits: spec.Tensor,
-      targets: spec.Tensor,
-      weights: Optional[spec.Tensor] = None,
+      label_batch: spec.Tensor,
+      logits_batch: spec.Tensor,
+      mask_batch: Optional[spec.Tensor] = None,
       label_smoothing: float = 0.0,
   ) -> Dict[str, spec.Tensor]:  # differentiable
-    """Compute weighted cross entropy and entropy for log probs and targets.
+    """Compute weighted cross entropy.
+
     Args:
-      logits: [batch, length, num_classes] float array.
-      targets: categorical targets [batch, length] int array.
-      weights: array of shape [batch, length].
-      label_smoothing: label smoothing constant, used to determine the on and off
-        values.
+      label_batch: categorical targets [batch, length] int array.
+      logits_batch: [batch, length, num_classes] float array.
+      mask_batch: weights array of shape [batch, length].
+      label_smoothing: Label smoothing factor in [0, 1]. When > 0, the target
+        distribution becomes (1 - label_smoothing) for the correct class and
+        label_smoothing / vocab_size for all other classes. Default is 0.0 (no smoothing).
+
     Returns:
       {'summed': scalar summed loss, 'n_valid_examples': scalar number of
-      valid examples in batch, 'per_example': 1-d array of per-example losses}
+      valid examples in batch, 'per_example': 2-d array of per-example losses}
     """
-    if logits.ndim != targets.ndim + 1:
+    if logits_batch.ndim != label_batch.ndim + 1:
       raise ValueError(
-          f'Incorrect shapes. Got shape {logits.shape} logits and '
-          f'{targets.shape} targets.'
+          f'Incorrect shapes. Got shape {logits_batch.shape} logits and '
+          f'{label_batch.shape} targets.'
       )
     # Compute log probabilities
-    log_probs = jax.nn.log_softmax(logits, axis=-1)
+    log_probs = jax.nn.log_softmax(logits_batch, axis=-1)
     # Extract log probability of the target class
     # Shape: [batch, length]
     target_log_probs = jnp.take_along_axis(
-        log_probs, targets[..., None], axis=-1
+        log_probs, label_batch[..., None], axis=-1
     ).squeeze(-1)
     # Cross-entropy with smoothing: -(1 - α) * log_p[target] - α * mean(log_p)
     # The above formula is easy to derive from the definition of label smoothing and cross-entropy loss.
@@ -124,11 +127,11 @@ def compute_weighted_cross_entropy(
     per_example_losses = -1.0 * (
         confidence * target_log_probs + smoothing_term * log_probs.sum(axis=-1)
     )
-    if weights is not None:
-      per_example_losses = jnp.where(weights, per_example_losses, 0.0)
-      n_valid_examples = weights.sum()
+    if mask_batch is not None:
+      per_example_losses = mask_batch * per_example_losses
+      n_valid_examples = mask_batch.sum()
     else:
-      n_valid_examples = targets.shape[0] * targets.shape[1]
+      n_valid_examples = label_batch.shape[0] * label_batch.shape[1]
     summed_loss = per_example_losses.sum()
     return {
         'summed': summed_loss,
@@ -147,12 +150,9 @@ def _eval_batch(
     logits, _ = self.model_fn(
         params, batch, model_state, spec.ForwardPassMode.EVAL, rng, False
     )
-    # Calculate cross-entropy loss
-    metrics = self.compute_weighted_cross_entropy(
-        logits, batch['targets'], batch['weights']
+    metrics = self.loss_fn(
+        label_batch=batch['targets'], logits_batch=logits, mask_batch=batch['weights']
     )
-    # CRITICAL: Detach tensors to free computation graph and activations
-    # Without this, all intermediate activations are kept in memory!
     return {
         'loss': metrics['summed'],
         'denominator': metrics['n_valid_examples'],
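
For reference, here is a minimal standalone sketch of the masked, label-smoothed cross entropy that the new loss_fn computes. The definitions of confidence and smoothing_term are not visible in this diff, so the sketch assumes confidence = 1 - label_smoothing and smoothing_term = label_smoothing / vocab_size, as implied by the in-code formula comment; smoothed_cross_entropy and the example shapes are illustrative, not part of the PR.

# Sketch only; confidence/smoothing_term definitions are assumed (not shown in the diff).
import jax
import jax.numpy as jnp


def smoothed_cross_entropy(logits_batch, label_batch, mask_batch=None,
                           label_smoothing=0.0):
  vocab_size = logits_batch.shape[-1]
  confidence = 1.0 - label_smoothing
  smoothing_term = label_smoothing / vocab_size
  # Log probabilities over the vocabulary: [batch, length, vocab_size].
  log_probs = jax.nn.log_softmax(logits_batch, axis=-1)
  # Log probability of the target class: [batch, length].
  target_log_probs = jnp.take_along_axis(
      log_probs, label_batch[..., None], axis=-1
  ).squeeze(-1)
  # -(1 - α) * log_p[target] - (α / V) * sum(log_p)
  per_example = -1.0 * (
      confidence * target_log_probs + smoothing_term * log_probs.sum(axis=-1)
  )
  if mask_batch is not None:
    # Zero out padding positions and count only the valid ones.
    per_example = mask_batch * per_example
    n_valid = mask_batch.sum()
  else:
    n_valid = label_batch.shape[0] * label_batch.shape[1]
  return {'summed': per_example.sum(),
          'n_valid_examples': n_valid,
          'per_example': per_example}


# Illustrative usage: batch of 2 sequences, length 3, vocab of 5.
logits = jax.random.normal(jax.random.PRNGKey(0), (2, 3, 5))
labels = jnp.array([[1, 2, 3], [0, 4, 2]])
mask = jnp.array([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])  # last token of row 0 is padding
out = smoothed_cross_entropy(logits, labels, mask, label_smoothing=0.1)
mean_loss = out['summed'] / out['n_valid_examples']

Dividing 'summed' by 'n_valid_examples' gives the mean loss over non-padding tokens, which is presumably how the 'loss' and 'denominator' values returned by _eval_batch are combined downstream.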