
Commit 9004379

awaelchli and carmocca authored
Clarify self.log(..., rank_zero_only=True|False) (#19056)
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent a6da1e3 commit 9004379

File tree

4 files changed (+29 -10 lines changed)


docs/source-pytorch/accelerators/accelerator_prepare.rst (+4 -2)

@@ -121,14 +121,16 @@ It is possible to perform some computation manually and log the reduced result o
     mean = torch.mean(self.all_gather(self.outputs))
     self.outputs.clear()  # free memory

-    # When logging only on rank 0, don't forget to add
+    # When you call `self.log` only on rank 0, don't forget to add
     # `rank_zero_only=True` to avoid deadlocks on synchronization.
-    # caveat: monitoring this is unimplemented. see https://github.com/Lightning-AI/lightning/issues/15852
+    # Caveat: monitoring this is unimplemented, see https://github.com/Lightning-AI/lightning/issues/15852
     if self.trainer.is_global_zero:
         self.log("my_reduced_metric", mean, rank_zero_only=True)

+
 ----

+
 **********************
 Make models pickleable
 **********************
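
For context, a minimal sketch of the kind of hook the snippet above is excerpted from. The class name, the placeholder metric, and the `self.outputs` buffer are illustrative assumptions; only the gather/reduce/rank-zero-log pattern comes from the documentation being edited.

```python
import torch
import lightning.pytorch as pl


class LitModel(pl.LightningModule):
    """Hypothetical module; only the hooks relevant to rank-zero-only logging are shown."""

    def __init__(self):
        super().__init__()
        self.outputs = []  # per-batch values collected independently on each rank

    def validation_step(self, batch, batch_idx):
        # Placeholder per-batch metric; replace with your own computation.
        value = torch.as_tensor(batch).float().mean()
        self.outputs.append(value)

    def on_validation_epoch_end(self):
        # Gather the per-rank values and reduce them manually.
        mean = torch.mean(self.all_gather(torch.stack(self.outputs)))
        self.outputs.clear()  # free memory

        # `self.log` is called from rank 0 only, so `rank_zero_only=True` is required
        # to avoid deadlocks on synchronization.
        if self.trainer.is_global_zero:
            self.log("my_reduced_metric", mean, rank_zero_only=True)
```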

docs/source-pytorch/extensions/logging.rst (+1 -1)

@@ -141,7 +141,7 @@ The :meth:`~lightning.pytorch.core.LightningModule.log` method has a few options
 * ``sync_dist_group``: The DDP group to sync across.
 * ``add_dataloader_idx``: If True, appends the index of the current dataloader to the name (when using multiple dataloaders). If False, user needs to give unique names for each dataloader to not mix the values.
 * ``batch_size``: Current batch size used for accumulating logs logged with ``on_epoch=True``. This will be directly inferred from the loaded batch, but for some data structures you might need to explicitly provide it.
-* ``rank_zero_only``: Whether the value will be logged only on rank 0. This will prevent synchronization which would produce a deadlock as not all processes would perform this log call.
+* ``rank_zero_only``: Set this to ``True`` only if you call ``self.log`` explicitly only from rank 0. If ``True`` you won't be able to access or specify this metric in callbacks (e.g. early stopping).

 .. list-table:: Default behavior of logging in Callback or LightningModule
    :widths: 50 25 25
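
As a companion to the updated bullet, a short hypothetical sketch contrasting the two modes: logging from every process with `sync_dist=True` (the metric stays usable as a callback monitor) versus logging from rank 0 only with `rank_zero_only=True`. The class and metric names are placeholders, not part of this commit.

```python
import torch
import lightning.pytorch as pl


class LitModel(pl.LightningModule):
    """Hypothetical module contrasting the two logging modes."""

    def validation_step(self, batch, batch_idx):
        loss = torch.as_tensor(batch).float().mean()  # placeholder metric

        # Called on every process; `sync_dist=True` reduces the value across processes,
        # and the metric can still be monitored by callbacks.
        self.log("val_loss", loss, sync_dist=True)

        # Called from rank 0 only; no synchronization happens, and the metric cannot be
        # accessed or monitored in callbacks such as early stopping.
        if self.trainer.is_global_zero:
            self.log("val_loss_rank0", loss, rank_zero_only=True)
```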

docs/source-pytorch/visualize/logging_advanced.rst (+16 -3)

@@ -196,13 +196,26 @@ If set to True, logs will be sent to the progress bar.

 rank_zero_only
 ==============
-**Default:** True
+**Default:** False
+
+Tells Lightning if you are calling ``self.log`` from every process (default) or only from rank 0.
+This is for advanced users who want to reduce their metric manually across processes, but still want to benefit from automatic logging via ``self.log``.

-Whether the value will be logged only on rank 0. This will prevent synchronization which would produce a deadlock as not all processes would perform this log call.
+- Set ``False`` (default) if you are calling ``self.log`` from every process.
+- Set ``True`` if you are calling ``self.log`` from rank 0 only. Caveat: you won't be able to use this metric as a monitor in callbacks (e.g., early stopping).

 .. code-block:: python

-    self.log(rank_zero_only=True)
+    # Default
+    self.log(..., rank_zero_only=False)
+
+    # If you call `self.log` on rank 0 only, you need to set `rank_zero_only=True`
+    if self.trainer.global_rank == 0:
+        self.log(..., rank_zero_only=True)
+
+    # DON'T do this, it will cause deadlocks!
+    self.log(..., rank_zero_only=True)
+

 ----
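
One practical consequence of the caveat above: a metric logged with `rank_zero_only=True` cannot serve as a callback `monitor`. A minimal sketch of wiring callbacks to a metric that is logged from every process instead; the metric name and callback choice are assumptions, not part of this commit.

```python
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping

# "val_loss" is assumed to be logged from every process, e.g. via
# `self.log("val_loss", loss, sync_dist=True)`. A metric logged with
# `rank_zero_only=True` could not be monitored here.
trainer = pl.Trainer(
    callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
    accelerator="cpu",
    devices=2,
    strategy="ddp",
)
```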

src/lightning/pytorch/core/module.py (+8 -4)

@@ -400,8 +400,10 @@ def log(
                 but for some data structures you might need to explicitly provide it.
             metric_attribute: To restore the metric state, Lightning requires the reference of the
                 :class:`torchmetrics.Metric` in your model. This is found automatically if it is a model attribute.
-            rank_zero_only: Whether the value will be logged only on rank 0. This will prevent synchronization which
-                would produce a deadlock as not all processes would perform this log call.
+            rank_zero_only: Tells Lightning if you are calling ``self.log`` from every process (default) or only from
+                rank 0. If ``True``, you won't be able to use this metric as a monitor in callbacks
+                (e.g., early stopping). Warning: Improper use can lead to deadlocks! See
+                :ref:`Advanced Logging <visualize/logging_advanced:rank_zero_only>` for more details.

         """
         if self._fabric is not None:

@@ -563,8 +565,10 @@ def log_dict(
                 each dataloader to not mix values.
             batch_size: Current batch size. This will be directly inferred from the loaded batch,
                 but some data structures might need to explicitly provide it.
-            rank_zero_only: Whether the value will be logged only on rank 0. This will prevent synchronization which
-                would produce a deadlock as not all processes would perform this log call.
+            rank_zero_only: Tells Lightning if you are calling ``self.log`` from every process (default) or only from
+                rank 0. If ``True``, you won't be able to use this metric as a monitor in callbacks
+                (e.g., early stopping). Warning: Improper use can lead to deadlocks! See
+                :ref:`Advanced Logging <visualize/logging_advanced:rank_zero_only>` for more details.

         """
         if self._fabric is not None:
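
The same rule applies to `log_dict`, whose docstring receives the identical update above. A minimal hypothetical sketch; the metric names and values are placeholders.

```python
import torch
import lightning.pytorch as pl


class LitModel(pl.LightningModule):
    """Hypothetical module; shows `rank_zero_only` with `self.log_dict`."""

    def on_validation_epoch_end(self):
        # Placeholder values, assumed to have been reduced manually across processes.
        metrics = {"my_mean": torch.tensor(0.0), "my_max": torch.tensor(1.0)}

        # As with `self.log`: set `rank_zero_only=True` only because the call itself
        # is made from rank 0 alone.
        if self.trainer.is_global_zero:
            self.log_dict(metrics, rank_zero_only=True)
```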
