8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,14 @@
# CHANGELOG


## v0.5.0 (2026-01-08)

### Features

- Add optimizer-aware gradients
([`497edab`](https://github.com/EleutherAI/bergson/commit/497edab8f2ca19d8fcb1d409fbd99452a929584e))


## v0.4.6 (2026-01-06)

### Bug Fixes
3 changes: 3 additions & 0 deletions README.md
@@ -15,6 +15,9 @@ We view attribution as a counterfactual question: **_If we "unlearned" this trai

# Announcements

**January 2026**
- [Experimental] Support distributing preconditioners across nodes and devices for VRAM-efficient computation via `GradientCollectorWithDistributedPreconditioners`. If you would like this functionality exposed via the CLI, please get in touch! https://github.com/EleutherAI/bergson/pull/100
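A minimal sketch of how the new collector might be constructed, assuming it mirrors the existing `GradientCollector` in import location and accepts a process group to shard preconditioner state over; the import path, the `process_group` keyword, and the stand-in model are assumptions, not a documented API, and the CLI does not expose this yet:

```python
import torch.distributed as dist
from torch import nn

# Assumed import path, mirroring GradientCollector; check the package for the real location.
from bergson.collector.gradient_collectors import (
    GradientCollectorWithDistributedPreconditioners,
)

model = nn.Linear(16, 16)  # stand-in model for illustration only

# Hypothetical constructor arguments; the real signature may differ.
collector = GradientCollectorWithDistributedPreconditioners(
    model=model,
    process_group=dist.group.WORLD,  # shard preconditioner state across all ranks
)
```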

**October 2025**
- Support bias parameter gradients in linear modules: https://github.com/EleutherAI/bergson/pull/54
- Support convolution modules: https://github.com/EleutherAI/bergson/pull/50
2 changes: 1 addition & 1 deletion bergson/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.4.6"
__version__ = "0.5.0"

from .collection import collect_gradients
from .collector.gradient_collectors import GradientCollector
3 changes: 3 additions & 0 deletions bergson/__main__.py
@@ -14,6 +14,9 @@

def validate_run_path(index_cfg: IndexConfig):
"""Validate the run path."""
if index_cfg.distributed.rank != 0:
return

for path in [Path(index_cfg.run_path), Path(index_cfg.partial_run_path)]:
if not path.exists():
continue
29 changes: 19 additions & 10 deletions bergson/build.py
@@ -12,15 +12,18 @@
from bergson.collection import collect_gradients
from bergson.config import IndexConfig
from bergson.data import allocate_batches
from bergson.distributed import launch_distributed_run
from bergson.utils.utils import assert_type, setup_reproducibility
from bergson.utils.worker_utils import setup_model_and_peft

from .distributed import launch_distributed_run
from .utils.worker_utils import create_processor, setup_data_pipeline
from bergson.utils.worker_utils import (
create_processor,
setup_data_pipeline,
setup_model_and_peft,
)


def build_worker(
rank: int,
local_rank: int,
[Inline review comment from a contributor on `local_rank`: "add to doc what this does"]
world_size: int,
cfg: IndexConfig,
ds: Dataset | IterableDataset,
@@ -32,14 +35,16 @@ def build_worker(
----------
rank : int
Distributed rank / GPU ID for this worker.
local_rank : int
Local rank / GPU ID for this worker on the node.
world_size : int
Total number of workers participating in the run.
cfg : IndexConfig
Specifies the model, tokenizer, PEFT adapters, and other settings.
ds : Dataset | IterableDataset
The entire dataset to be indexed. A subset is assigned to each worker.
"""
torch.cuda.set_device(rank)
torch.cuda.set_device(local_rank)

# These should be set by the main process
if world_size > 1:
@@ -49,14 +54,14 @@
dist.init_process_group(
"nccl",
init_method=f"tcp://{addr}:{port}",
device_id=torch.device(f"cuda:{rank}"),
device_id=torch.device(f"cuda:{local_rank}"),
rank=rank,
timeout=timedelta(hours=1),
world_size=world_size,
)

model, target_modules = setup_model_and_peft(cfg, rank)
processor = create_processor(model, ds, cfg, rank, target_modules)
model, target_modules = setup_model_and_peft(cfg)
processor = create_processor(model, ds, cfg, target_modules)

attention_cfgs = {module: cfg.attention for module in cfg.split_attention_modules}

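For context on the `rank` / `local_rank` split introduced in this hunk: the global rank identifies a worker across every node, while the local rank selects which GPU that worker owns on its own node, so `set_device` and the NCCL `device_id` must use the local value. Below is a generic sketch of that convention using torchrun-style environment variables; it is not the launcher code from this PR:

```python
import os
from datetime import timedelta

import torch
import torch.distributed as dist

# Typical torchrun environment: RANK is global across nodes, LOCAL_RANK is per node.
rank = int(os.environ.get("RANK", 0))
local_rank = int(os.environ.get("LOCAL_RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))

torch.cuda.set_device(local_rank)  # each process claims one GPU on its node

if world_size > 1:
    dist.init_process_group(
        "nccl",
        device_id=torch.device(f"cuda:{local_rank}"),  # bind NCCL to the local device
        rank=rank,  # identify the process by its global rank
        timeout=timedelta(hours=1),
        world_size=world_size,
    )
```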
@@ -119,6 +124,10 @@ def build(index_cfg: IndexConfig):

ds = setup_data_pipeline(index_cfg)

launch_distributed_run("build", build_worker, [index_cfg, ds])
launch_distributed_run(
"build", build_worker, [index_cfg, ds], index_cfg.distributed
)

shutil.move(index_cfg.partial_run_path, index_cfg.run_path)
rank = index_cfg.distributed.rank
if rank == 0:
shutil.move(index_cfg.partial_run_path, index_cfg.run_path)
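The driver-side change above does two things: it passes the `index_cfg.distributed` settings into `launch_distributed_run`, and it promotes the partial run directory only on rank 0, so multiple processes never race on the same filesystem move. A generic sketch of that finalization pattern follows; it is not this repo's exact helper, and the barrier is illustrative, since in the PR the workers have already finished before the move happens:

```python
import shutil

import torch.distributed as dist


def finalize_run(partial_path: str, final_path: str, rank: int) -> None:
    """Promote a partial run directory, letting only rank 0 touch the filesystem."""
    if dist.is_available() and dist.is_initialized():
        dist.barrier()  # wait until every worker has finished writing
    if rank == 0:
        shutil.move(partial_path, final_path)
```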
2 changes: 0 additions & 2 deletions bergson/collection.py
@@ -24,8 +24,6 @@ def collect_gradients(
"""
Compute gradients using the hooks specified in the GradientCollector.
"""
if attention_cfgs is None:
attention_cfgs = {}
collector = GradientCollector(
model=model.base_model, # type: ignore
cfg=cfg,