Commit d7a885c

Porting workload input pipeline to torch
- Added a `limit_tf_threads` parameter to `pytorch_init` to control TensorFlow threading based on the workload type; the dataloader was going OOM otherwise.
- Updated the input pipeline to support `None` for weights (to save memory).
- Modified the Transformer model's `forward` method to optionally return the loss during training; it should be better to fuse the loss later.
- Adjusted the torch LM workload configuration (model dimensions and parameters) to match the JAX workload.
- Updated the transformers version in `pyproject.toml`; the older pinned version seems to be unavailable.
1 parent 65369f2 commit d7a885c

6 files changed (+90, -121 lines)


algoperf/pytorch_utils.py

Lines changed: 2 additions & 4 deletions
@@ -27,7 +27,7 @@ def pytorch_setup() -> Tuple[bool, int, torch.device, int]:
   return use_pytorch_ddp, rank, device, n_gpus


-def pytorch_init(use_pytorch_ddp: bool, rank: int, profiler: Profiler) -> None:
+def pytorch_init(use_pytorch_ddp: bool, rank: int, profiler: Profiler, limit_tf_threads=True) -> None:
   # Make sure no GPU memory is preallocated to Jax.
   os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
   # Only use CPU for Jax to avoid memory issues.
@@ -39,18 +39,16 @@ def pytorch_init(use_pytorch_ddp: bool, rank: int, profiler: Profiler) -> None:

   if use_pytorch_ddp:
     # Avoid tf input pipeline creating too many threads.
-    if rank != 0:
+    if rank != 0 and limit_tf_threads:
       tf.config.threading.set_intra_op_parallelism_threads(1)
       tf.config.threading.set_inter_op_parallelism_threads(1)

     torch.cuda.set_device(rank)
     profiler.set_local_rank(rank)
     # Only log once (for local rank == 0).
     if rank != 0:
-
       def logging_pass(*args):
         pass
-
       logging.info = logging_pass
     # Initialize the process group.
     dist.init_process_group('nccl')
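
A minimal usage sketch (not part of the commit) of the new flag. It assumes the PassThroughProfiler from algoperf.profiler used elsewhere in the repo; the LM workload is expected to pass limit_tf_threads=False so its tf.data pipeline keeps its threads (see the submission_runner.py change below).

  # Hypothetical illustration only: mirrors how submission_runner.py is
  # expected to call pytorch_init for the LM workload after this change.
  from algoperf import pytorch_utils
  from algoperf.profiler import PassThroughProfiler  # assumed import path, as in the repo

  use_ddp, rank, device, n_gpus = pytorch_utils.pytorch_setup()
  # For the LM workload, keep TF threading unrestricted to avoid starving tf.data.
  pytorch_utils.pytorch_init(use_ddp, rank, PassThroughProfiler(), limit_tf_threads=False)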

algoperf/workloads/lm/input_pipeline.py

Lines changed: 7 additions & 1 deletion
@@ -107,7 +107,13 @@ def get_lm_dataset(
     repeated_sequences_dataset = shuffled_sequences_ds.repeat()
     ds = repeated_sequences_dataset.batch(
       global_batch_size, drop_remainder=False
-    ).prefetch(tf.data.experimental.AUTOTUNE)
+    )
+    ds = ds.map(lambda x: {
+      'inputs': x['inputs'],
+      'targets': x['targets'],
+      'weights': None,
+    })
+    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
   elif split == 'eval_train':
     ds = batch_with_padding(
       sequences_ds,
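
Since train batches now carry 'weights': None, a downstream consumer that needs an explicit mask can fall back to all-ones. A minimal sketch with a hypothetical helper (not part of the commit):

  import torch

  def effective_weights(batch):
    # Treat a missing/None weights entry as "every token counts".
    if batch.get('weights') is None:
      return torch.ones_like(batch['targets'], dtype=torch.float32)
    return batch['weights']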

algoperf/workloads/lm/lm_pytorch/plainlm_model.py

Lines changed: 42 additions & 32 deletions
@@ -159,7 +159,7 @@ def __init__(self, cfg):
     if cfg.tie_embeddings:
       self.tie_weights()

-  def forward(self, x):
+  def forward(self, x, targets=None):
     # x: (bsz, seqlen)
     x = self.embed_tokens(x)  # (bsz, seqlen, dim)
     L = x.shape[1]
@@ -178,7 +178,12 @@ def forward(self, x):

     for layer in self.layers:
       x = layer(x, freqs_cis)  # (bsz, seqlen, dim)
-    return self.lm_head(self.out_norm(x))  # (bsz, seqlen, vocab_size)
+    out = self.lm_head(self.out_norm(x))  # (bsz, seqlen, vocab_size)
+    if targets is not None:
+      loss = F.cross_entropy(
+        out.view(-1, out.size(-1)), targets.view(-1), ignore_index=-100)
+      return out, loss
+    return out

   def predict(self, x, k=1):
     """Generate k tokens autoregressively.
@@ -190,18 +195,14 @@ def predict(self, x, k=1):
     Returns:
       Tuple of (input_ids, predicted_ids)
     """
-    # For debugging
-    predictions = []
-
-    batch_size = x.shape[0]
-    seq_len = x.shape[1]

     # Store original input
     original_input = x.clone()
     generated_input = x.clone()

     # Generate k tokens autoregressively
     for i in range(k):
+
       # Get logits for the entire sequence
       logits = self(generated_input)
@@ -212,24 +213,20 @@ def predict(self, x, k=1):
       # This is a common issue - the model gets stuck repeating the last token
       last_token_id = generated_input[:, -1]
       next_token_logits.scatter_(1, last_token_id.unsqueeze(1), float('-inf'))
-
-      # Print top 5 tokens for debugging
-      if i == 0:
-        print("\nPyTorch detailed prediction:")
-        top5_values, top5_indices = torch.topk(next_token_logits[0], 5)
-        for j, (idx, val) in enumerate(zip(top5_indices.tolist(), top5_values.tolist())):
-          prob = torch.softmax(next_token_logits[0], dim=-1)[idx].item()
-          print(f"  Top {j+1}: Token {idx}, logit={val:.2f}, prob={prob:.6f}")
-
+
       # Get the most likely token
       next_token = torch.argmax(next_token_logits, dim=-1)
-      predictions.append(next_token.item())

       # Append the predicted token to the sequence
       next_token = next_token.unsqueeze(1)  # Add sequence dimension
       generated_input = torch.cat([generated_input, next_token], dim=1)

-    print(f"  Full predictions step by step: {predictions}")
+    # For debugging, print predictions for the first item in the batch
+    print("\nPyTorch detailed prediction (first item in batch):")
+    predicted_sequence = generated_input[0, -k:].tolist()
+    print(f"  Predicted token IDs: {predicted_sequence}")
+    for i, token_id in enumerate(predicted_sequence):
+      print(f"  Step {i+1}: Predicted token {token_id}")

     # Return all tokens, not just the last k
     return original_input, generated_input[:, -k:]
@@ -269,30 +266,43 @@ def count_params(self, non_embedding=True):
 def main():
   print("Initializing transformer model and running forward pass...")

-  seq_length = 512
+  seq_length = 1024

   # Define model configuration
   config = ModelConfig(
-    vocab_size=32000,    # Common vocab size for tokenizers like BPE or SentencePiece
+    vocab_size=50257,    # Common vocab size for tokenizers like BPE or SentencePiece
     seq_len=seq_length,  # Maximum sequence length
-    dim=768,             # Embedding dimension
+    dim=1024,            # Embedding dimension
     expand=4.0,          # MLP expansion factor
     n_layers=12,         # Number of transformer layers
-    n_heads=12,          # Number of attention heads
+    n_heads=8,           # Number of attention heads
     rmsnorm_eps=1e-6,    # RMSNorm epsilon
     tie_embeddings=True  # Tie embedding and output weights
   )

-  def tie_weights(self):
-    self.lm_head.weight = self.embed_tokens.weight
+  # Instantiate the model
+  model = Transformer(config)
+  print(f"Model has {model.count_params():,} parameters.")

-  def count_params(self, non_embedding=True):
-    n_params = sum(p.numel() for p in self.parameters())
-    if non_embedding:
-      n_params -= self.embed_tokens.weight.numel()
-    if (not self.lm_head.weight
-        is self.embed_tokens.weight):  # if no weight tying
-      n_params -= self.lm_head.weight.numel()
-    return n_params
+  # Create some random input data
+  batch_size = 2
+  input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))
+
+  # Move data to the same device as the model
+  if torch.cuda.is_available():
+    input_ids = input_ids.cuda()
+
+  # Run a forward pass
+  print(f"Running forward pass with input shape: {input_ids.shape}")
+  logits = model(input_ids)
+  print(f"Output logits shape: {logits.shape}")

+  # Run prediction
+  print("Running prediction...")
+  original_input, predicted_ids = model.predict(input_ids[:, :10], k=5)
+  print(f"Original input shape for prediction: {original_input.shape}")
+  print(f"Predicted IDs shape: {predicted_ids.shape}")
+  print(f"Predicted IDs: {predicted_ids}")

+if __name__ == "__main__":
+  main()
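
A short training-step sketch (illustrative values, not part of the commit) showing the new fused path, where forward(x, targets=...) returns both the logits and the cross-entropy loss in one call; ModelConfig and Transformer are the classes defined in plainlm_model.py above, and the optimizer/learning rate are arbitrary examples.

  import torch

  cfg = ModelConfig(vocab_size=50257, seq_len=1024, dim=1024, expand=4.0,
                    n_layers=12, n_heads=8, rmsnorm_eps=1e-6, tie_embeddings=True)
  model = Transformer(cfg)
  optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)  # example LR only

  inputs = torch.randint(0, 50257, (2, 1024))   # random token ids
  targets = torch.randint(0, 50257, (2, 1024))

  logits, loss = model(inputs, targets=targets)  # loss computed inside forward
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()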

algoperf/workloads/lm/lm_pytorch/workload.py

Lines changed: 36 additions & 82 deletions
@@ -14,6 +14,7 @@
   Transformer,
 )
 from algoperf.workloads.lm.workload import BaseLmWorkload
+from algoperf.workloads.lm.input_pipeline import get_data_iter

 USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_utils.pytorch_setup()

@@ -37,10 +38,11 @@ def init_model_fn(
     cfg = ModelConfig(
       vocab_size=self._vocab_size,
       seq_len=self._seq_len,
-      dim=512,      # Model dimension
-      expand=4,     # MLP expansion factor
-      n_layers=6,   # Number of transformer layers
-      n_heads=8,    # Number of attention heads
+      dim=self._emb_dim,                      # Model dimension
+      expand=self._mlp_dim // self._emb_dim,  # MLP expansion factor
+      # FIXME(rka97): fix expansion factor
+      n_layers=self._n_layers,                # Number of transformer layers
+      n_heads=self._n_heads,                  # Number of attention heads
       rmsnorm_eps=1e-6,
       tie_embeddings=True
     )
@@ -65,7 +67,7 @@ def model_fn(
       mode: spec.ForwardPassMode,
       rng: spec.RandomState,
       update_batch_norm: bool,
-      dropout_rate: None) -> Tuple[spec.Tensor, spec.ModelAuxiliaryState]:
+      dropout_rate: float = 0.0) -> Tuple[spec.Tensor, spec.ModelAuxiliaryState]:

     del model_state, rng, update_batch_norm, dropout_rate
     model = params
@@ -87,10 +89,8 @@ def _build_input_queue(
       num_batches: Optional[int] = None,
       repeat_final_dataset: bool = False) -> Iterator[Dict[str, spec.Tensor]]:
     """Build an input queue for the given split."""
-    from algoperf.workloads.lm.input_pipeline import get_lm_dataset
     local_batch_size = global_batch_size // N_GPUS
-
-    loader = get_lm_dataset(
+    loader = get_data_iter(
       data_rng=data_rng,
       split=split,
       data_dir=data_dir,
@@ -99,100 +99,54 @@
     )
     if USE_PYTORCH_DDP:
       loader = islice(loader, RANK, None, N_GPUS)
-    seq_len = self._seq_len
-    weights = None
-
     dtype = torch.int32
-    is_train = split == 'train'
-
     for batch in loader:
-      inputs = batch['inputs']
-      targets = batch['targets']
-
-      if USE_PYTORCH_DDP:
-        if not is_train:
-          # During eval, the batch size of the remainder might be different
-          per_device_batch_size = torch.tensor(
-              targets.shape[0], dtype=dtype, device=DEVICE)
-          dist.broadcast(per_device_batch_size, src=0)
-          local_batch_size = per_device_batch_size.item()
-        # Broadcast to all devices
-        #dist.broadcast(inputs, src=0)
-        #dist.broadcast(targets, src=0)
-
-      if weights is None:
-        weights = torch.ones((local_batch_size, seq_len), device=DEVICE)
       batch = {
-        'inputs': torch.tensor(inputs, device=DEVICE, dtype=dtype),
-        'targets': torch.tensor(targets, device=DEVICE, dtype=dtype),
-        'weights': weights,
+        'inputs': torch.tensor(batch['inputs'], device=DEVICE, dtype=dtype),
+        'targets': torch.tensor(batch['targets'], device=DEVICE, dtype=torch.int64),
+        'weights': None,
       }
       yield batch

   def is_output_params(self, param_name: str) -> bool:
     """Return whether the given parameter is an output parameter."""
     return 'lm_head.weight' in param_name or 'lm_head.bias' in param_name

-  def _eval_batch(self,
-                  params: spec.ParameterContainer,
-                  batch: Dict[str, spec.Tensor],
-                  model_state: spec.ModelAuxiliaryState,
-                  rng: spec.RandomState) -> spec.Tensor:
-    """Evaluate the model on a single batch."""
-    model = params
-    logits, _ = self.model_fn(
-        model, batch, model_state, spec.ForwardPassMode.EVAL, rng, False)
-
-    # Handle both one-hot and token ID targets
-    targets = batch['targets']
-    if targets.dim() == 3:  # one-hot
-      loss = -torch.sum(targets * torch.nn.functional.log_softmax(logits, dim=-1))
-    else:  # token IDs
-      # TODO(kasimbeg): before deleting make sure we have defined self.weighted_cross_entropy so that we can call the shared workload _eval_batch.
-      loss = torch.nn.functional.cross_entropy(
-          logits.view(-1, logits.size(-1)),
-          targets.view(-1),
-          reduction='sum'
-      )
-    return loss
-
-  def loss_fn(
-      self,
-      label_batch: spec.Tensor,
-      logits_batch: spec.Tensor,
-      mask_batch: Optional[spec.Tensor] = None,
-      label_smoothing: float = 0.0) -> Dict[str, spec.Tensor]:
+  # FIXME(rka97): Implement label smoothing
+  def compute_weighted_cross_entropy(self, logits: spec.Tensor, labels: spec.Tensor, weights: spec.Tensor, label_smoothing: float = 0.0) -> Dict[str, spec.Tensor]:
     """Compute cross-entropy loss for language modeling in PyTorch."""
-    vocab_size = logits_batch.shape[-1]
+    vocab_size = logits.size(-1)

-    if len(label_batch.shape) == len(logits_batch.shape):
+    if len(labels.shape) == len(logits.shape):
       # One-hot labels
-      log_probs = torch.nn.functional.log_softmax(logits_batch, dim=-1)
-      loss = -torch.sum(label_batch * log_probs, dim=-1)
+      log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
+      loss = -torch.sum(labels * log_probs, dim=-1)
     else:
       # Dense labels
       loss = torch.nn.functional.cross_entropy(
-          logits_batch,
-          label_batch,
+          logits.view(-1, vocab_size),
+          labels.view(-1),
           reduction='none')
-      if mask_batch is not None:
-        loss = loss * mask_batch
+      loss = loss.view_as(labels)
+
+    if weights is not None:
+      loss = loss * weights

-    n_valid = mask_batch.sum() if mask_batch is not None else label_batch.shape[0]
+    n_valid = weights.sum() if weights is not None else torch.tensor(labels.numel(), dtype=torch.float32, device=labels.device)
     return {
       'summed': loss.sum(),
       'n_valid_examples': n_valid,
-      'per_example': loss
+      'per_example': loss,
     }

-  def _normalize_eval_metrics(
-      self, num_examples: int, total_metrics: Dict[str, Any]
-  ) -> Dict[str, float]:
-    """Normalize eval metrics."""
-    del num_examples
-    if USE_PYTORCH_DDP:
-      for metric in total_metrics.values():
-        dist.all_reduce(metric)
-    total_metrics = {k: v.item() for k, v in total_metrics.items()}
-    eval_denominator = total_metrics.pop('denominator')
-    return jax.tree.map(lambda x: float(x / eval_denominator), total_metrics)
+  def _normalize_eval_metrics(
+      self, num_examples: int, total_metrics: Dict[str, Any]
+  ) -> Dict[str, float]:
+    """Normalize eval metrics."""
+    del num_examples
+    if USE_PYTORCH_DDP:
+      for metric in total_metrics.values():
+        dist.all_reduce(metric)
+    total_metrics = {k: v.item() for k, v in total_metrics.items()}
+    eval_denominator = total_metrics.pop('denominator')
+    return jax.tree.map(lambda x: float(x / eval_denominator), total_metrics)
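
A quick sketch (illustrative shapes, not part of the commit) of how the new compute_weighted_cross_entropy would be called when the input queue yields weights=None, in which case every target token counts toward the denominator; `workload` is assumed to be an instance of the PyTorch LM workload class above.

  import torch

  logits = torch.randn(8, 256, 50257)          # (batch, seq_len, vocab_size)
  labels = torch.randint(0, 50257, (8, 256))   # dense token ids
  metrics = workload.compute_weighted_cross_entropy(logits, labels, weights=None)
  mean_loss = metrics['summed'] / metrics['n_valid_examples']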

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ librispeech_conformer = [
   "pydub==0.25.1",
 ]
 wmt = ["sentencepiece==0.2.0", "tensorflow-text==2.19.0"]
-lm = ["transformers==4.25.4", "datasets==3.6.0"]
+lm = ["transformers==4.26", "datasets==3.6.0"]

 # Frameworks
 jax_core_deps = [

submission_runner.py

Lines changed: 2 additions & 1 deletion
@@ -784,7 +784,8 @@ def main(_):
     os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256'

   if FLAGS.framework == 'pytorch':
-    pytorch_init(USE_PYTORCH_DDP, RANK, profiler)
+    limit_tf_threads = (base_workload != 'lm')
+    pytorch_init(USE_PYTORCH_DDP, RANK, profiler, limit_tf_threads=limit_tf_threads)

   # TODO: remove once issue resolved.
   if FLAGS.pytorch_eval_num_workers != 0:
