Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
fd4145a
Add content-based selection and set as fusion default
RolandBERTINJOHANNET Nov 25, 2025
2039528
Add helper to attach learned attention, keep fusion default random
RolandBERTINJOHANNET Nov 25, 2025
6930d1d
Move learned attention helper to fusion class
RolandBERTINJOHANNET Nov 25, 2025
a8004b1
Rebalance broadcast loss coefs and aggregates
RolandBERTINJOHANNET Nov 25, 2025
e646158
Fix indentation in selection helper causing test import failure
RolandBERTINJOHANNET Nov 25, 2025
4f09afc
Run ruff cleanups in selection
RolandBERTINJOHANNET Nov 25, 2025
b31049a
Format global_workspace with ruff
RolandBERTINJOHANNET Nov 25, 2025
95d054f
Align selection signatures and move learned attention helper
RolandBERTINJOHANNET Nov 25, 2025
7bca399
Apply ruff formatting to global workspace
RolandBERTINJOHANNET Nov 25, 2025
95b3b62
Update broadcasts docstrings
RolandBERTINJOHANNET Nov 28, 2025
cc9dbc3
Remove fused loss handler
RolandBERTINJOHANNET Nov 28, 2025
1b2fd83
Rename learned attention module
RolandBERTINJOHANNET Dec 8, 2025
c16fd1d
Add LearnedAttention coverage
RolandBERTINJOHANNET Dec 8, 2025
4db5e55
Fix LearnedAttention test types for mypy
RolandBERTINJOHANNET Dec 8, 2025
bc53230
Split cycle loss from broadcast path
RolandBERTINJOHANNET Dec 10, 2025
931df04
Format code with ruff
RolandBERTINJOHANNET Dec 10, 2025
9f15e82
Move cycle reconstruction into cycle loss
RolandBERTINJOHANNET Dec 10, 2025
3dca8a4
Ensure LearnedAttention only builds selected key projection
RolandBERTINJOHANNET Dec 10, 2025
c192aac
Annotate LearnedAttention key layers as optional for mypy
RolandBERTINJOHANNET Dec 10, 2025
bf3f75c
Fix mypy issues in LearnedAttention and ckpt migration CLI
RolandBERTINJOHANNET Dec 10, 2025
664dc4f
Revert ckpt migration typing tweaks
RolandBERTINJOHANNET Dec 10, 2025
6b66406
Fix mypy for ckpt migration CLI by using string paths
RolandBERTINJOHANNET Dec 10, 2025
101f7f1
Add domain-latent key option to LearnedAttention
RolandBERTINJOHANNET Dec 11, 2025
d1badae
Format LearnedAttention per ruff
RolandBERTINJOHANNET Dec 11, 2025
02167d1
Pass domain key options through init_learned_attention
RolandBERTINJOHANNET Dec 11, 2025
8388448
Drop duplicate missing-domain-dims check in attention tests
RolandBERTINJOHANNET Dec 11, 2025
553ab6f
Warn before initializing LearnedAttention
RolandBERTINJOHANNET Dec 11, 2025
887cbcf
Refactor broadcast naming and stop logging metrics
RolandBERTINJOHANNET Dec 17, 2025
17b467a
Warn when loss coef missing
RolandBERTINJOHANNET Dec 17, 2025
c6506ef
Add warning test for missing loss coef
RolandBERTINJOHANNET Dec 17, 2025
4dc4e7c
Clarify broadcast docstring
RolandBERTINJOHANNET Dec 17, 2025
8601c92
Run ruff format
RolandBERTINJOHANNET Dec 17, 2025
3213179
Restore broadcast metrics logging (minus aggregate)
RolandBERTINJOHANNET Dec 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,4 @@ cython_debug/
.rgignore

.ruff_cache/
.poetry_cache/
2 changes: 2 additions & 0 deletions shimmer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
combine_loss,
)
from shimmer.modules.selection import (
LearnedAttention,
RandomSelection,
SelectionBase,
SingleDomainSelection,
Expand Down Expand Up @@ -103,6 +104,7 @@
"RandomSelection",
"SelectionBase",
"SingleDomainSelection",
"LearnedAttention",
"DomainDesc",
"RepeatedDataset",
"ShimmerDataset",
Expand Down
6 changes: 3 additions & 3 deletions shimmer/cli/ckpt_migration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
@click.argument(
"paths",
nargs=-1,
type=click.Path(exists=True, path_type=Path, file_okay=True, dir_okay=False),
type=click.Path(exists=True, file_okay=True, dir_okay=False),
)
def migrate_ckpt(paths: Sequence[Path]):
def migrate_ckpt(paths: Sequence[str]):
"""
Script to migrate a list of checkpoints.
This can be called with:
Expand All @@ -24,4 +24,4 @@ def migrate_ckpt(paths: Sequence[Path]):
Internally, this calls `shimmer.utils.migrate_model` for each of the given paths.
"""
for path in paths:
migrate_model(path)
migrate_model(Path(path))
19 changes: 0 additions & 19 deletions shimmer/modules/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,25 +185,6 @@ def compute_tr_loss(
"""
return self.compute_loss(pred, target, raw_target)

def compute_fused_loss(
self, pred: torch.Tensor, target: torch.Tensor, raw_target: Any
) -> LossOutput | None:
"""
Computes the loss for fused (fusion). Override if the fused loss is
different that the generic loss.

Args:
pred (`torch.Tensor`): prediction of the model
target (`torch.Tensor`): target tensor
raw_target (`Any`): raw data from the input
Results:
`LossOutput | None`: LossOuput with training loss and additional metrics.
If `None` is returned, this loss will be ignored and will not
participate in the total loss; it can be used to deactivate
fused loss for this domain.
"""
return self.compute_loss(pred, target, raw_target)

def compute_domain_loss(self, domain: Any) -> LossOutput | None:
"""
Compute the unimodal domain loss.
Expand Down
68 changes: 64 additions & 4 deletions shimmer/modules/global_workspace.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from collections.abc import Callable, Iterable, Mapping
from enum import Enum, auto
from pathlib import Path
Expand Down Expand Up @@ -29,6 +30,7 @@
LossCoefs,
)
from shimmer.modules.selection import (
LearnedAttention,
RandomSelection,
SelectionBase,
SingleDomainSelection,
Expand Down Expand Up @@ -65,7 +67,7 @@ class GWPredictionsBase(TypedDict):
broadcasts: dict[frozenset[str], dict[str, torch.Tensor]]
"""
broadcasts predictions of the model for each domain. It contains demi-cycles,
translations, and fused.
translations.
"""

cycles: dict[frozenset[str], dict[str, torch.Tensor]]
Expand Down Expand Up @@ -706,7 +708,7 @@ def __init__(
)


class GlobalWorkspaceFusion(GlobalWorkspaceBase[GWModule, RandomSelection, GWLosses]):
class GlobalWorkspaceFusion(GlobalWorkspaceBase[GWModule, SelectionBase, GWLosses]):
"""The fusion (with broadcast loss) flavor of GlobalWorkspaceBase.

This is used to simplify a Global Workspace instantiation and only overrides the
Expand All @@ -721,6 +723,7 @@ def __init__(
workspace_dim: int,
loss_coefs: BroadcastLossCoefs | Mapping[str, float],
selection_temperature: float = 0.2,
selection_mod: SelectionBase | None = None,
optim_lr: float = 1e-3,
optim_weight_decay: float = 0.0,
scheduler_args: SchedulerArgs | None = None,
Expand Down Expand Up @@ -748,7 +751,9 @@ def __init__(
loss_coefs (`BroadcastLossCoefs | Mapping[str, float]`): loss coefs for the
losses.
selection_temperature (`float`): temperature value for the RandomSelection
module.
module (default selection).
selection_mod (`SelectionBase | None`): optional custom selection module.
If None (default), uses `RandomSelection`.
optim_lr (`float`): learning rate
optim_weight_decay (`float`): weight decay
scheduler_args (`SchedulerArgs | None`): optimization scheduler's arguments
Expand All @@ -772,7 +777,8 @@ def __init__(
torch.tensor([1 / 0.07]).log(), "mean", learn_logit_scale
)

selection_mod = RandomSelection(selection_temperature)
if selection_mod is None:
selection_mod = RandomSelection(selection_temperature)
loss_mod = GWLosses(
gw_mod, selection_mod, domain_mods, loss_coefs, contrastive_loss
)
Expand All @@ -787,6 +793,60 @@ def __init__(
scheduler,
)

def init_learned_attention(
self,
head_size: int = 64,
per_domain_keys: bool = False,
stopgrad: bool = True,
key_on_prefusion: bool = True,
domain_dims: Mapping[str, int] | None = None,
) -> LearnedAttention:
"""
Initialize and attach a learned content-based attention module.

This replaces `self.selection_mod` with a `LearnedAttention` configured for
the current workspace (uses `workspace_dim` and domain names from
`domain_mods`), ensuring its parameters are tracked by Lightning/torch.

Args:
    head_size (`int`): projection size of the attention head.
    per_domain_keys (`bool`): give each domain its own key projection.
    stopgrad (`bool`): passed through to `LearnedAttention`; presumably
        detaches inputs to the attention keys — confirm in the selection module.
    key_on_prefusion (`bool`): compute keys on pre-fusion (workspace-dim)
        representations. When `False`, keys are computed on raw domain
        latents, which requires `per_domain_keys=True`.
    domain_dims (`Mapping[str, int] | None`): latent dimension per domain,
        only needed when `key_on_prefusion=False`. Defaults to each
        domain module's `latent_dim`.

Returns:
    `LearnedAttention`: the newly attached selection module.

Raises:
    ValueError: if `key_on_prefusion=False` without `per_domain_keys=True`,
        or if `domain_dims` is missing an entry for some domain.
"""
# Upfront caveat for users: this selection path is experimental.
warnings.warn(
(
"LearnedAttention is best used after pretraining the global workspace "
"with a simpler selection (e.g., random or single-domain). "
"This path is minimally validated; use at your own risk."
),
UserWarning,
stacklevel=2,
)
# Raw domain latents may have different sizes per domain, so a single
# shared key projection cannot be used in that configuration.
if not key_on_prefusion and not per_domain_keys:
raise ValueError(
"key_on_prefusion=False requires per_domain_keys=True because "
"domain latent dimensions can differ."
)

final_domain_dims = domain_dims
# Domain dims are only relevant when keying on raw domain latents.
if not key_on_prefusion:
# Default to each domain module's declared latent dimension.
if final_domain_dims is None:
final_domain_dims = {
name: mod.latent_dim for name, mod in self.domain_mods.items()
}
# Every known domain must have a dimension, whether user-supplied
# or derived above.
missing = [d for d in self.domain_mods if d not in final_domain_dims]
if missing:
raise ValueError(
f"Missing domain_dims for: {', '.join(sorted(missing))}"
)

selection = LearnedAttention(
gw_dim=self.workspace_dim,
domain_names=self.domain_mods.keys(),
head_size=head_size,
per_domain_keys=per_domain_keys,
stopgrad=stopgrad,
key_on_prefusion=key_on_prefusion,
domain_dims=final_domain_dims,
)
# Assigning to an attribute of this nn.Module registers the new
# submodule, so its parameters are picked up by the optimizer.
self.selection_mod = selection
return selection


def pretrained_global_workspace(
checkpoint_path: str | Path,
Expand Down
2 changes: 1 addition & 1 deletion shimmer/modules/gw_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ class GWModulePrediction(TypedDict):
broadcasts: dict[str, torch.Tensor]
"""
broadcasts predictions of the model for each domain. It contains demi-cycles,
translations, and fused.
translations.
"""

cycles: dict[str, torch.Tensor]
Expand Down
Loading