Commit 5deb0bb

Author: Seppo Enarvi
Fixed checkpoint loading with WeightAveraging
1 parent: 822231f

2 files changed (+5, -2 lines)


Diff for: src/lightning/pytorch/callbacks/weight_averaging.py (+4, -1)

@@ -304,7 +304,10 @@ def on_load_checkpoint(
             average_model_state = {"module." + name: value for name, value in checkpoint["state_dict"].items()}
             average_model_state |= checkpoint["averaging_state"]
             self._average_model.load_state_dict(average_model_state)
-            checkpoint["state_dict"] = checkpoint["current_model_state"]
+            # The current model state has already been loaded from "state_dict" (which contains the average model
+            # weights) at this point, so overwriting "state_dict" in the checkpoint dictionary makes no difference. We
+            # have to reload the model state from "current_model_state".
+            pl_module.load_state_dict(checkpoint["current_model_state"])
         else:
             rank_zero_warn(
                 "The checkpoint was not created with WeightAveraging. Both the current and the average model will be "

Diff for: tests/tests_pytorch/callbacks/test_weight_averaging.py (+1, -1)

@@ -235,7 +235,7 @@ def test_ema_resume(tmp_path, crash_on_epoch):
     model2 = _train_and_resume(model2, dataset, tmp_path)
 
     for param1, param2 in zip(model1.parameters(), model2.parameters()):
-        assert torch.allclose(param1, param2, atol=0.001)
+        assert torch.allclose(param1, param2)
 
 
 @RunIf(skip_windows=True)
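Dropping the explicit tolerance means torch.allclose falls back to its defaults (rtol=1e-5, atol=1e-8), so the resumed run is now expected to reproduce the parameters essentially exactly rather than merely to within 1e-3. A small standalone illustration of the tightened check (the tensor values are made up for the example):

    import torch

    a = torch.tensor([1.0000, 2.0000])
    b = torch.tensor([1.0005, 2.0005])

    # The old explicit tolerance accepted a drift of roughly 1e-3...
    assert torch.allclose(a, b, atol=0.001)
    # ...whereas the defaults (rtol=1e-5, atol=1e-8) reject it, so the test now
    # requires the resumed parameters to match almost exactly.
    assert not torch.allclose(a, b)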
