nvidia-cosmos · jashshah999 · Feb 25, 2026
diff --git a/cosmos_predict2/_src/predict2/text_encoders/reason1.py b/cosmos_predict2/_src/predict2/text_encoders/reason1.py
@@ -231,10 +231,12 @@ def _forward(
     MODIFICATIONS: adding the hidden states to the output.
     """
 
-    def forward(self, tokens, data_batch={}, start_pos: int = 0) -> torch.Tensor:
+    def forward(self, tokens, data_batch=None, start_pos: int = 0) -> torch.Tensor:
         """
         The training step of the model, including the loss computation.
         """
+        if data_batch is None:
+            data_batch = {}
         assert "pixel_values" not in data_batch, "pixel_values should not be in data_batch, use images instead"
         pixel_values = data_batch.get("images", None)
         image_grid_thw = data_batch.get("image_grid_thw", None)

diff --git a/cosmos_predict2/_src/reason1/models/vlm_base.py b/cosmos_predict2/_src/reason1/models/vlm_base.py
@@ -414,7 +414,7 @@ def training_step(
     def build_model(self, model_config):
         raise NotImplementedError
 
-    def forward(self, tokens, data_batch={}, start_pos: int = 0) -> torch.Tensor:
+    def forward(self, tokens, data_batch=None, start_pos: int = 0) -> torch.Tensor:
         """
         The forward pass of the model.
         Returns:

diff --git a/cosmos_predict2/_src/reason1/models/vlm_qwen.py b/cosmos_predict2/_src/reason1/models/vlm_qwen.py
@@ -370,10 +370,12 @@ def _forward(
             logits = DTensor.from_local(logits, device_mesh=self.cp_mesh, placements=[Shard(1)]).full_tensor()
         return logits
 
-    def forward(self, tokens, data_batch={}, start_pos: int = 0) -> torch.Tensor:
+    def forward(self, tokens, data_batch=None, start_pos: int = 0) -> torch.Tensor:
         """
         The training step of the model, including the loss computation.
         """
+        if data_batch is None:
+            data_batch = {}
         assert "pixel_values" not in data_batch, "pixel_values should not be in data_batch, use images instead"
         pixel_values = data_batch.get("images", None)
         image_grid_thw = data_batch.get("image_grid_thw", None)

diff --git a/cosmos_predict2/_src/reason1/models/vlm_qwen_omni.py b/cosmos_predict2/_src/reason1/models/vlm_qwen_omni.py
@@ -268,19 +268,19 @@ def _forward(
     MODIFICATIONS: adding the hidden states to the output.
     """
 
-    def forward(self, tokens, data_batch={}, start_pos: int = 0) -> torch.Tensor:
+    def forward(self, tokens, data_batch=None, start_pos: int = 0) -> torch.Tensor:
         """
         The training step of the model, including the loss computation.
         """
+        if data_batch is None:
+            data_batch = {}
         assert "pixel_values" not in data_batch, "pixel_values should not be in data_batch, use images instead"
         pixel_values = data_batch.get("images", None)
         image_grid_thw = data_batch.get("image_grid_thw", None)
         pixel_values_videos = data_batch.get("videos", None)
         video_grid_thw = data_batch.get("video_grid_thw", None)
         attention_mask = data_batch.get("padding_mask", None)
 
-        attention_mask = data_batch.get("padding_mask", None)
-
         if image_grid_thw is not None:
             assert len(image_grid_thw) == 1, "Only batch=1 is supported for now, due to `get_rope_index`"
             image_grid_thw = image_grid_thw[0]  # 1, N_img, 3 -> N_img, 3