Changes from all commits
84 commits
ceb1e04
init branch
LouisYRYJ May 15, 2025
1dc6a53
Merge branch 'main' into approx-unrolling
LouisYRYJ May 15, 2025
8b52f14
EK-FAC running
LouisYRYJ May 22, 2025
1f873c4
checkpoint EKFACs running
LouisYRYJ May 22, 2025
8581b14
Merge branch 'main' into approx-unrolling
LouisYRYJ May 22, 2025
2ac0540
Merge branch 'main' into approx-unrolling
LouisYRYJ May 25, 2025
bc2d8bf
WIP resetting
LouisYRYJ May 26, 2025
b7f1557
averages working
May 26, 2025
07626fc
unrolling pipeline works for small models (otherwise we get OOM)
May 30, 2025
de08537
pre kronfluence vendoring
LouisYRYJ Jun 2, 2025
0d2740f
pipeline with vendored library working
LouisYRYJ Jun 2, 2025
d00f620
removed score utilities from hessian
LouisYRYJ Jun 2, 2025
a692b89
debugging covariance randomness
LouisYRYJ Jun 3, 2025
97ea928
Merge branch 'main' into approx-unrolling
LouisYRYJ Jun 3, 2025
59d15f5
renaming quelle -> bergson
LouisYRYJ Jun 3, 2025
05325bf
Merge branch 'main' into approx-unrolling and testing
LouisYRYJ Jun 4, 2025
8489f44
fsdp testing
LouisYRYJ Jun 5, 2025
16d7022
covariance with hooks working
LouisYRYJ Jun 6, 2025
aa08964
using closure for covariance processing
LouisYRYJ Jun 11, 2025
c9384f6
using fsdp for covariance processing working
LouisYRYJ Jun 11, 2025
cb8466f
refactoring hessians
LouisYRYJ Jun 12, 2025
9b6045d
quick clean up
LouisYRYJ Jun 13, 2025
fce6d0d
merging
LouisYRYJ Jun 13, 2025
ae7bbc0
debugging memory leaks
LouisYRYJ Jun 17, 2025
6651e78
merge main
LouisYRYJ Jun 20, 2025
6e7eaae
ekfac refactoring WIP
LouisYRYJ Jun 21, 2025
3771dc7
EKFAC refactoring WIP
LouisYRYJ Jun 22, 2025
34a2bdf
KFAC done + slow Eigenvalue correction
LouisYRYJ Jun 22, 2025
7eed777
pipeline running
LouisYRYJ Jun 23, 2025
0ede5b6
merge main
LouisYRYJ Jun 24, 2025
24b5820
pipeline WIP
LouisYRYJ Jun 24, 2025
495cc85
pipeline with new set up running
LouisYRYJ Jun 24, 2025
27b8a71
merge main
LouisYRYJ Jun 25, 2025
144bb32
memory efficient pipeline for bigger models WIP
LouisYRYJ Jun 25, 2025
faa5d6b
scaling covariance
LouisYRYJ Jun 26, 2025
d47dd3c
detach grads
LouisYRYJ Jun 26, 2025
b667b90
sharding covariances WIP
LouisYRYJ Jun 27, 2025
1222850
proper saving WIP
LouisYRYJ Jun 29, 2025
7172a14
eigenvectors sharded
LouisYRYJ Jun 30, 2025
7c4e51a
writing and running tests (pipeline running for 7B)
LouisYRYJ Jul 1, 2025
4aaedf8
ekfac tests, 1 device passing
LouisYRYJ Jul 3, 2025
67fa643
clean up WIP
LouisYRYJ Jul 4, 2025
a43df83
refactor + bug fix: .contiguous must be called BEFORE dist.all_reduce
LouisYRYJ Jul 8, 2025
bf14d88
clean up and add README
LouisYRYJ Jul 8, 2025
d141a82
add specification
LouisYRYJ Jul 8, 2025
be41c0b
clean up WIP
LouisYRYJ Jul 8, 2025
086bcbd
merge main
LouisYRYJ Jul 8, 2025
f46c3b6
more clean up
LouisYRYJ Jul 8, 2025
d8600ac
reformatting
LouisYRYJ Jul 8, 2025
c214258
fix cpu nonblocking bug in compute_eigenvector
LouisYRYJ Jul 14, 2025
852855b
sharded matmul refactoring
LouisYRYJ Jul 15, 2025
566fd5e
small fixes
LouisYRYJ Jul 15, 2025
4714af8
rewriting dist WIP
LouisYRYJ Jul 16, 2025
9ac34e6
refactoring distributed done
LouisYRYJ Jul 16, 2025
7b0128f
fix label when prompt exceeds max_token_len
LouisYRYJ Jul 17, 2025
8a90724
attribution with ekfac
LouisYRYJ Jul 17, 2025
f6c6a30
merge main
LouisYRYJ Jul 17, 2025
33138ac
remove path dependency
LouisYRYJ Jul 18, 2025
3a305ee
fix path dependency
LouisYRYJ Jul 18, 2025
1524cca
refactor + added logger + ekfac transform running
LouisYRYJ Jul 18, 2025
7cea24b
big refactor
LouisYRYJ Jul 30, 2025
ad712f3
apply ekfac refactor
LouisYRYJ Jul 31, 2025
63c1343
merging main
LouisYRYJ Jul 31, 2025
a735ad1
reformatting
LouisYRYJ Jul 31, 2025
422157c
(re)move notebooks
LouisYRYJ Jul 31, 2025
a05d95a
minor changes
LouisYRYJ Jul 31, 2025
2528b0e
Remove attribute_results.ipynb from tracking
LouisYRYJ Jul 31, 2025
20e6b92
test ekfac apply passing
LouisYRYJ Aug 5, 2025
8393fe9
clean up
LouisYRYJ Aug 5, 2025
e252d62
apply ekfac + datafiltering
LouisYRYJ Aug 7, 2025
516ebc8
adding peft fsdp test
LouisYRYJ Aug 8, 2025
a203b75
switch fsdp and peft
LouisYRYJ Aug 8, 2025
9e0e87e
fixing peft fsdp interaction
LouisYRYJ Aug 11, 2025
30e45ab
smaller refactor for peft loading
LouisYRYJ Aug 11, 2025
aff9c64
ekfac fix to get same results as kronfluence
LouisYRYJ Aug 15, 2025
64d4eb0
update script + small fix
LouisYRYJ Aug 15, 2025
fa0a4a6
fix script path
LouisYRYJ Aug 15, 2025
8ca6324
running ekfac sweeps
LouisYRYJ Aug 17, 2025
2d13de2
refactoring ekfac computations WIP
LouisYRYJ Aug 27, 2025
c3dc77d
sharded computation moved into different class DONE
LouisYRYJ Aug 27, 2025
6d0f935
distributed friendly logging (only log rank 0)
LouisYRYJ Aug 27, 2025
28b5866
removing processor
LouisYRYJ Sep 3, 2025
644558f
cut normalizers, make ekfac_apply file handling more clear
LouisYRYJ Sep 16, 2025
bff15f2
remove break statement, now attn is included by default
LouisYRYJ Sep 20, 2025
33 changes: 28 additions & 5 deletions .gitignore
@@ -161,23 +161,46 @@ dmypy.json
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# VS Code
.vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# models
*.pt
*.pth
*.safetensors
*.json
*.jsonl
*.txt
*.arrow
*.bin
*.csv

# plots
*.png
*.jpg
*.jpeg
*.gif

# debugging results
*.svg
*.pickle
# Faiss index files
*.faiss
# Local directory for run artifacts
runs/
cache/

wandb/
.vscode/


9 changes: 7 additions & 2 deletions bergson/__main__.py
@@ -1,11 +1,16 @@
from simple_parsing import parse

from .build import build_gradient_dataset
from bergson.distributed import distributed_computing

from .collection import collect_gradients
from .data import IndexConfig


def main():
build_gradient_dataset(parse(IndexConfig))
distributed_computing(
parse(IndexConfig),
worker_fn=collect_gradients,
)


if __name__ == "__main__":
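For orientation, the new entry point hands the parsed `IndexConfig` to a `distributed_computing` dispatcher together with `collect_gradients` as the worker. Below is a rough sketch of what such a dispatcher can look like; it is an assumption for illustration, not bergson's actual `distributed.py`, and the real worker also receives a model, dataset, and `GradientProcessor`, whose preparation is elided here.

```python
import os
from typing import Any, Callable

import torch
import torch.distributed as dist


def distributed_computing(cfg: Any, worker_fn: Callable[..., None]) -> None:
    """Illustrative dispatcher: set up the process group (when launched via
    torchrun, which exports RANK/WORLD_SIZE), run the worker, and tear down."""
    if "RANK" in os.environ and not dist.is_initialized():
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        dist.init_process_group(backend=backend)
    try:
        worker_fn(cfg)  # the real call site presumably also passes model/data/processor
    finally:
        if dist.is_initialized():
            dist.destroy_process_group()
```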
271 changes: 0 additions & 271 deletions bergson/build.py

This file was deleted.

15 changes: 6 additions & 9 deletions bergson/collection.py
@@ -10,7 +10,7 @@
from tqdm.auto import tqdm
from transformers import PreTrainedModel

from .data import create_index, pad_and_tensor
from .data import IndexConfig, create_index, pad_and_tensor
from .gradients import (
AdafactorNormalizer,
AdamNormalizer,
@@ -24,11 +24,10 @@ def collect_gradients(
model: PreTrainedModel,
data: Dataset,
processor: GradientProcessor,
path: str,
*,
batches: list[list[int]] | None = None,
skip_preconditioners: bool = False,
target_modules: set[str] | None = None,
cfg: IndexConfig,
):
"""
Compute projected gradients using a subset of the dataset.
@@ -54,7 +53,7 @@ def callback(name: str, g: torch.Tensor):
mod_grads[name] = g.to(device="cpu", dtype=torch.float16, non_blocking=True)

# Compute the outer product of the flattened gradient
if not skip_preconditioners:
if not cfg.skip_preconditioners:
g = g.float()
preconditioner = preconditioners.get(name, None)
if preconditioner is None:
@@ -73,9 +72,7 @@ def callback(name: str, g: torch.Tensor):
grad_sizes = {name: math.prod(s) for name, s in collector.shapes().items()}

# Allocate structured space ahead of time for the gradients
grad_buffer = create_index(
path, num_grads=len(data), grad_sizes=grad_sizes, dtype=np.float16
)
grad_buffer = create_index(cfg.run_path, num_grads=len(data), grad_sizes=grad_sizes, dtype=np.float16)

per_doc_losses = torch.full(
(len(data),),
@@ -139,9 +136,9 @@ def callback(name: str, g: torch.Tensor):
feature=Value("float16"),
new_fingerprint="loss",
)
data.save_to_disk(path + "/data.hf")
data.save_to_disk(cfg.run_path + "/data.hf")

processor.save(path)
processor.save(cfg.run_path)

# Make sure the gradients are written to disk
grad_buffer.flush()
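The hunk above routes all paths through `cfg.run_path` and ends by flushing `grad_buffer`, which points at a memory-mapped, structured buffer with one record per example and one fixed-size float16 field per module gradient. The sketch below illustrates that storage pattern only; the module names, sizes, and file name are made up, and it is not bergson's actual `create_index`.

```python
import os

import numpy as np

# Hypothetical per-module gradient sizes and dataset size, for illustration only.
grad_sizes = {"mlp.down_proj": 1024, "attn.q_proj": 512}
num_grads = 8

# One record per example; each field holds that module's flattened gradient in float16.
dtype = np.dtype([(name, np.float16, (size,)) for name, size in grad_sizes.items()])

os.makedirs("runs/example", exist_ok=True)
grad_buffer = np.lib.format.open_memmap(
    "runs/example/gradients.npy", mode="w+", dtype=dtype, shape=(num_grads,)
)

grad_buffer["mlp.down_proj"][0] = np.random.randn(1024).astype(np.float16)
grad_buffer.flush()  # make sure the gradients actually reach disk
```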