add dynamic recordings + minor changes and bug fixes
FlorianDietz committed Jul 19, 2024
1 parent 2bc8705 commit 096dc58
Showing 10 changed files with 624 additions and 505 deletions.
2 changes: 1 addition & 1 deletion .run/server_single_test_0.run.xml
@@ -15,7 +15,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/scripts/server.py" />
<option name="PARAMETERS" value="--path &quot;/media/remote/ExperimentsVolume/erg/comgra_data/debugging0&quot; --port 9050 --visualization-file &quot;$PROJECT_DIR$/../erg/erg/architectures/comgra_visualization__interactive_2d_classification.py&quot;" />
<option name="PARAMETERS" value="--path &quot;/media/remote/ExperimentsVolume7/erg/comgra_data/debugging0&quot; --port 9050 --visualization-file &quot;$PROJECT_DIR$/../erg/erg/architectures/comgra_visualization__interactive_2d_classification.py&quot;" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
4 changes: 2 additions & 2 deletions .run/server_single_test_1.run.xml
@@ -14,8 +14,8 @@
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/scripts/start_server.py" />
<option name="PARAMETERS" value="--path &quot;/media/remote/ExperimentsVolume/erg/comgra_data/debugging1&quot; --port 9051 --visualization-file &quot;$PROJECT_DIR$/../erg/erg/architectures/comgra_visualization__interactive_2d_classification.py&quot;" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/scripts/server.py" />
<option name="PARAMETERS" value="--path &quot;/media/remote/ExperimentsVolume7/erg/comgra_data/debugging1&quot; --port 9051 --visualization-file &quot;$PROJECT_DIR$/../erg/erg/architectures/comgra_visualization__interactive_2d_classification.py&quot;" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
38 changes: 26 additions & 12 deletions README.md
@@ -13,8 +13,8 @@
- [Finding the Bug](#finding-the-bug)
- [Other Features](#other-features)
- [Custom Visualization](#custom-visualization)
- [Dynamic Recordings](#dynamic-recordings)
- [Known Issues](#known-issues)
- [Future Development: Dynamic Recordings](#future-development-dynamic-recordings)
- [Future Development: Anomaly Detection and Correlation Analysis](#future-development-anomaly-detection-and-correlation-analysis)

## Overview
@@ -82,7 +82,6 @@ comgra.my_recorder.register_tensor(...)
# e.g. because of detach() commands or non-differentiable dependencies.
comgra.my_recorder.add_tensor_connection(...)
# Call these whenever you apply losses and propagate gradients:
comgra.my_recorder.start_backward_pass()
comgra.my_recorder.record_current_gradients(...)
# Call this whenever you end an iteration:
comgra.my_recorder.finish_iteration()
@@ -364,6 +363,31 @@ The file `scripts/example_custom_visualization.py` results in the following visu
| -


## Dynamic Recordings


By default, you have to decide at the beginning of a training step whether you want comgra to record it. However, it can happen that you only know at the end of a training step whether the step was interesting enough to be worth recording, or whether some parts of the batch were more interesting than the rest.

We therefore provide an optional feature that lets you decide retroactively, at the end of the batch, whether to record it and which parts of the batch to record. This can help with debugging, e.g. by automatically finding and recording the very first step where gradients start going out of bounds.

Combine this with anomaly detection and comgra will be able to extract the most interesting and informative samples for you and make them easily accessible using selectors.

To use Dynamic Recordings, simply pass `None` as the value of `type_of_execution` in the `start_batch()` function. Then, at the end of the batch but before you call `finish_batch()`, call `decide_recording_of_batch()`. This function takes the actual value of `type_of_execution` under which the recording should be saved (if you provide `None` again, the recording is skipped). It also takes an argument `category_per_sample`, which selects which indices of the batch should be recorded: each sample in the batch may be assigned a category, and comgra will try to record an equal number of samples from each category.
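The retroactive workflow above can be sketched as a training-loop skeleton. Since comgra's exact signatures are not shown in this section, `StubRecorder` below is a hypothetical stand-in whose method names follow this README; the criterion for an "interesting" step (a gradient-norm threshold) is likewise an illustrative assumption.

```python
class StubRecorder:
    """Hypothetical stand-in for comgra's recorder; method names follow the
    README, but the exact signatures are assumptions."""

    def __init__(self):
        self.saved_recordings = []
        self._pending = None

    def start_batch(self, training_step, type_of_execution):
        # type_of_execution=None defers the recording decision to the end of the batch.
        self._pending = {"step": training_step, "deferred": type_of_execution is None}

    def decide_recording_of_batch(self, type_of_execution, category_per_sample=None):
        # Called before finish_batch(); passing None means "skip this recording after all".
        if type_of_execution is not None:
            self.saved_recordings.append(
                (self._pending["step"], type_of_execution, category_per_sample)
            )

    def finish_batch(self):
        self._pending = None


recorder = StubRecorder()
for step in range(3):
    recorder.start_batch(training_step=step, type_of_execution=None)
    grad_norm = [0.5, 0.7, 25.0][step]  # pretend these come from the backward pass
    # Retroactive decision: only keep steps whose gradients went out of bounds.
    recorder.decide_recording_of_batch(
        type_of_execution="exploding_gradients" if grad_norm > 10.0 else None,
    )
    recorder.finish_batch()

print(recorder.saved_recordings)  # only the step with the exploding gradient is kept
```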


Note that Dynamic Recordings are not used by default because they are more computationally expensive.

<details>
<summary><b>Click here to expand: About the slowdown</b></summary>

When Dynamic Recordings are used, comgra has to set `requires_grad` on all tensors in advance, even if it later turns out that no recording is necessary, and this increases runtime.

If you are using a limited number of different `type_of_execution` values to categorize your batches, you can mitigate this problem with the function `declare_that_all_different_types_of_execution_have_been_encountered()`. It enables comgra to recognize in advance that many training steps will not be recorded, because every `type_of_execution` they could be assigned to has been recorded too recently.

Additionally, Dynamic Recordings may be slower than normal recordings because of additional code on the user's side if you make use of `category_per_sample`: assigning a category to each sample in a batch requires transferring data from GPU to CPU, and doing this on every training step is costly. You can mitigate this problem by running that part of your code only if `recording_is_active()` returns `True`.
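As a minimal sketch of that mitigation, the loop below computes `category_per_sample` only on steps where a recording will actually happen. `recording_is_active()` is replaced here by an explicit stub, and the every-100th-step schedule and per-sample loss values are illustrative assumptions, not comgra's behavior.

```python
def recording_is_active_stub(step):
    # Stand-in for comgra's recording_is_active(); here: record every 100th step.
    return step % 100 == 0


def categorize(per_sample_losses):
    # Expensive in real code: this is where per-sample data would move from GPU to CPU.
    return ["hard" if loss > 1.0 else "easy" for loss in per_sample_losses]


transfers = 0
for step in range(300):
    per_sample_losses = [0.3, 1.7, 0.9]  # normally a GPU tensor of per-sample losses
    category_per_sample = None
    if recording_is_active_stub(step):  # gate the costly GPU-to-CPU transfer
        category_per_sample = categorize(per_sample_losses)
        transfers += 1

print(transfers)  # the transfer happened on only 3 of 300 steps
```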
</details>


## Known Issues


@@ -386,16 +410,6 @@ comgra.my_recorder.add_tensor_connection("my_tensor", b)
You should also be aware that `requires_grad` is necessary but not sufficient for `add_tensor_connection()` to work: It's possible that a tensor is built from two tensors, one of which has `requires_grad` and the other does not. Comgra won't automatically record the second one in this case and will not notice that anything is amiss. Use `add_tensor_connection()` to manually add connections for these cases.


## Future Development: Dynamic Recordings


Currently, you have to decide at the beginning of a training step whether you want comgra to record it. However, it can happen that you only know at the end of a training step if the step was interesting enough to be worth recording, or if any particular part of the batch was more interesting than the rest.

We therefore want to make it possible to decide retroactively, at the end of the batch, if you want to record, and what parts of the batch you want to record. This could e.g. help with debugging by automatically finding and recording the very first step where gradients start going out of bounds.

Combine this with anomaly detection and comgra will be able to extract the most interesting and informative samples for you and make them easily accessible using selectors.


## Future Development: Anomaly Detection and Correlation Analysis


2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ comgra = ["*.py"]
scripts = ["*.py"]
[project]
name = "comgra"
version = "0.11.4"
version = "0.11.5dev1"
authors = [
{ name="Florian Dietz", email="[email protected]" },
]
43 changes: 16 additions & 27 deletions src/comgra/objects.py
@@ -338,25 +338,32 @@ class DecisionMakerForRecordings(abc.ABC):
pass

@abc.abstractmethod
def is_record_on_this_iteration(self, training_step, type_of_execution):
def is_record_on_this_step(self, training_step, type_of_execution):
pass

@abc.abstractmethod
def mark_recording_on_this_step(self, training_step, type_of_execution):
pass


@dataclasses.dataclass
class DecisionMakerForRecordingsHardcoded(DecisionMakerForRecordings):
fixed_training_steps: set[int]

def is_record_on_this_iteration(self, training_step, type_of_execution):
def is_record_on_this_step(self, training_step, type_of_execution):
return training_step in self.fixed_training_steps

def mark_recording_on_this_step(self, training_step, type_of_execution):
pass


@dataclasses.dataclass
class DecisionMakerForRecordingsFrequencyPerType(DecisionMakerForRecordings):
min_training_steps_difference: int
exponential_backoff_factor: float = 1.0
identifier_to_last_recorded_step_and_min_difference: Dict = dataclasses.field(default_factory=dict)

def is_record_on_this_iteration(self, training_step, type_of_execution):
def is_record_on_this_step(self, training_step, type_of_execution):
if type_of_execution is None:
return False
assert self.exponential_backoff_factor >= 1.0, self.exponential_backoff_factor
@@ -366,30 +373,12 @@ def is_record_on_this_iteration(self, training_step, type_of_execution):
if last_recorded_step == training_step:
return True
if last_recorded_step is None or training_step >= last_recorded_step + min_difference:
min_difference = min_difference * self.exponential_backoff_factor
self.identifier_to_last_recorded_step_and_min_difference[type_of_execution] = (training_step, min_difference)
return True
return False


class DecisionMakerForRecordingsExponentialFalloff(DecisionMakerForRecordings):
maximum_number_of_recordings: int
current_valid_steps: List
current_step_size: int = 1

def __init__(self, maximum_number_of_recordings, starting_step_size):
super().__init__()
assert maximum_number_of_recordings > 1
self.maximum_number_of_recordings = maximum_number_of_recordings
self.current_valid_steps = []
self.current_step_size = starting_step_size

def is_record_on_this_iteration(self, training_step, type_of_execution):
if self.current_step_size * (self.maximum_number_of_recordings - 1) < training_step:
self.current_step_size *= 2
self.current_valid_steps = [
k for k in self.current_valid_steps
if k % self.current_step_size == 0
]
assert len(self.current_valid_steps) <= self.maximum_number_of_recordings
return (training_step % self.current_step_size) == 0
def mark_recording_on_this_step(self, training_step, type_of_execution):
last_recorded_step, min_difference = self.identifier_to_last_recorded_step_and_min_difference.get(
type_of_execution, (None, self.min_training_steps_difference)
)
min_difference = min_difference * self.exponential_backoff_factor
self.identifier_to_last_recorded_step_and_min_difference[type_of_execution] = (training_step, min_difference)
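The refactored decision maker in this diff can be exercised on its own. The sketch below reproduces the new `is_record_on_this_step` / `mark_recording_on_this_step` split from the diff, with one simplifying assumption: the abstract `DecisionMakerForRecordings` base class and the `Dict` annotation are omitted so the snippet is self-contained.

```python
import dataclasses


@dataclasses.dataclass
class DecisionMakerForRecordingsFrequencyPerType:
    # Simplified from the diff: the abstract base class is omitted here.
    min_training_steps_difference: int
    exponential_backoff_factor: float = 1.0
    identifier_to_last_recorded_step_and_min_difference: dict = dataclasses.field(default_factory=dict)

    def is_record_on_this_step(self, training_step, type_of_execution):
        if type_of_execution is None:
            return False
        assert self.exponential_backoff_factor >= 1.0, self.exponential_backoff_factor
        last_recorded_step, min_difference = self.identifier_to_last_recorded_step_and_min_difference.get(
            type_of_execution, (None, self.min_training_steps_difference))
        if last_recorded_step == training_step:
            return True
        # The state update moved to mark_recording_on_this_step(), so this
        # check is now side-effect free and can be called speculatively.
        return last_recorded_step is None or training_step >= last_recorded_step + min_difference

    def mark_recording_on_this_step(self, training_step, type_of_execution):
        last_recorded_step, min_difference = self.identifier_to_last_recorded_step_and_min_difference.get(
            type_of_execution, (None, self.min_training_steps_difference))
        # Exponential backoff: each recording multiplies the required gap between recordings.
        self.identifier_to_last_recorded_step_and_min_difference[type_of_execution] = (
            training_step, min_difference * self.exponential_backoff_factor)


dm = DecisionMakerForRecordingsFrequencyPerType(
    min_training_steps_difference=10, exponential_backoff_factor=2.0)
assert dm.is_record_on_this_step(0, "train")       # nothing recorded yet
dm.mark_recording_on_this_step(0, "train")         # gap for "train" grows to 20
assert not dm.is_record_on_this_step(5, "train")   # too soon after the last recording
assert dm.is_record_on_this_step(20, "train")      # gap of 20 reached
assert not dm.is_record_on_this_step(0, None)      # a None type is never recorded
```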
