Draft pull request (35 commits)

Commits:
d3148f7: added policy adaptors, factorized samplers to allow for modular adapt… (josephdviviano, Sep 29, 2025)
93a654a: fixed _SeqStates (josephdviviano, Sep 29, 2025)
33f96b1: Merge branch 'identity_preprocessor_remove_shape_checking' of github.… (josephdviviano, Sep 29, 2025)
14b110c: Update input_dim to use preprocessor output_dim (josephdviviano, Sep 29, 2025)
c7a3d8c: Update input_dim to use preprocessor's output_dim (josephdviviano, Sep 29, 2025)
788ff96: Merge branch 'master' of github.com:GFNOrg/torchgfn into generalize_s… (josephdviviano, Sep 29, 2025)
8eb98a5: Merge branch 'generalize_samplers' of github.com:GFNOrg/torchgfn into… (josephdviviano, Sep 29, 2025)
b3ed2bb: Merge branch 'generalize_samplers' of github.com:GFNOrg/torchgfn into… (josephdviviano, Oct 3, 2025)
a4fc53a: added draft of chunking logic -- need to test on some discrete enviro… (josephdviviano, Oct 8, 2025)
02856ee: added dtype casting to preprocessors (josephdviviano, Oct 9, 2025)
1224bc0: added vectorized and non-vectorized adapter-based probability calcula… (josephdviviano, Oct 9, 2025)
34202a8: added documentation (josephdviviano, Oct 9, 2025)
d1db3bd: removed strange change to documentation (josephdviviano, Oct 9, 2025)
08bf6eb: removed strange change to documentation (josephdviviano, Oct 9, 2025)
baa50e4: added basic recurrent bitsequence algorithm (josephdviviano, Oct 9, 2025)
e8d3fc2: added working bitsequence example for recurrent estimators and their … (josephdviviano, Oct 10, 2025)
e0dd464: fixed test (josephdviviano, Oct 10, 2025)
9d857ca: Merge branch 'generalize_samplers' of github.com:GFNOrg/torchgfn into… (josephdviviano, Oct 10, 2025)
496df98: Merge branch 'master' into chunking (josephdviviano, Oct 10, 2025)
bca3df6: Update estimators.py (josephdviviano, Oct 12, 2025)
fc6cb7a: black / isort (josephdviviano, Oct 13, 2025)
e2dc289: simplification of the contex, adapter logic, compression of documenta… (josephdviviano, Oct 13, 2025)
3c2862f: streamlined adapters under their own module (josephdviviano, Oct 13, 2025)
4a23ea0: typing (josephdviviano, Oct 13, 2025)
e2755e6: removed strict type ceck (josephdviviano, Oct 13, 2025)
ba6f0bd: shrank docs (josephdviviano, Oct 13, 2025)
db36953: added notes (josephdviviano, Oct 13, 2025)
3226660: removed finalize (josephdviviano, Oct 13, 2025)
4c2c1df: removed check_cond_forward (josephdviviano, Oct 13, 2025)
d066c97: removed record step (josephdviviano, Oct 13, 2025)
e638be9: lint errors (josephdviviano, Oct 14, 2025)
1ee6a8f: autoflake (josephdviviano, Oct 14, 2025)
6026008: minor formatting (josephdviviano, Oct 14, 2025)
aeec438: Merge pull request #413 from GFNOrg/make_adapters_part_of_estimators (josephdviviano, Oct 14, 2025)
f1e51c2: Merge branch 'generalize_samplers' of github.com:GFNOrg/torchgfn into… (josephdviviano, Oct 14, 2025)
1 change: 1 addition & 0 deletions pyproject.toml
@@ -30,6 +30,7 @@ tensordict = ">=0.6.1"
torch = ">=2.6.0"
torch_geometric = ">=2.6.1"
dill = ">=0.3.8"
tokenizers = ">=0.15"

# dev dependencies.
black = { version = "24.3", optional = true }
Empty file added src/gfn/chunking/__init__.py
Empty file.
130 changes: 130 additions & 0 deletions src/gfn/chunking/adapters.py
@@ -0,0 +1,130 @@
from __future__ import annotations

from typing import Any, Optional

import torch
from torch.distributions import Categorical, Distribution

from gfn.chunking.policy import ChunkedPolicy
from gfn.env import DiscreteEnv
from gfn.samplers import AdapterContext, EstimatorAdapter
from gfn.states import DiscreteStates


class ChunkedAdapter(EstimatorAdapter):
"""EstimatorAdapter that produces macro-level distributions using ChunkedPolicy.

Forward-only in this PR. TODO(backward): support backward chunking by switching
stepping and termination criteria to the backward direction.
"""

def __init__(self, env: DiscreteEnv, policy: ChunkedPolicy, library: Any) -> None:
self.env = env
self.policy = policy
self.library = library
self._is_backward = False # TODO(backward): allow backward chunking

@property
def is_backward(self) -> bool:
return self._is_backward

def init_context(
self,
batch_size: int,
device: torch.device,
conditioning: Optional[torch.Tensor] = None,
) -> AdapterContext:
ctx = AdapterContext(
batch_size=batch_size, device=device, conditioning=conditioning
)
ctx.extras["macro_log_probs"] = [] # List[(N,)]
return ctx

def _strict_macro_mask(self, states_active: DiscreteStates) -> torch.Tensor:
"""Strict mask by simulating each macro sequentially on each active state.

Invalidates a macro if any sub-action is invalid or if sink is reached before
the sequence completes. Guarantees EXIT macro is valid if no macro is valid.
"""
B = states_active.batch_shape[0]
N = self.library.n_actions
device = states_active.device
mask = torch.zeros(B, N, dtype=torch.bool, device=device)

for b in range(B):
Review comment: I wonder if it's possible to parallelize this.

s_curr = states_active[b : b + 1]
for j, seq in enumerate(self.library.id_to_sequence):
ok = True
s = s_curr
for k, a in enumerate(seq):
a_tensor = self.env.actions_from_tensor(
torch.tensor([[a]], device=device)
)
if not self.env.is_action_valid(s, a_tensor):
ok = False
break
s_next = self.env._step(s, a_tensor)
if s_next.is_sink_state.item() and k != len(seq) - 1:
ok = False
break
s = s_next
mask[b, j] = ok

# Ensure EXIT macro is available when none is valid
try:
exit_id = self.library.id_to_sequence.index([self.env.exit_action.item()])
except ValueError:
exit_id = N - 1
no_valid = ~mask.any(dim=1)
if no_valid.any():
mask[no_valid] = False
mask[no_valid, exit_id] = True
return mask

def compute(
self,
states_active: DiscreteStates,
ctx: Any,
step_mask: torch.Tensor,
**policy_kwargs: Any,
) -> tuple[Distribution, Any]:
logits = self.policy.forward_logits(states_active) # (B_active, N)
macro_mask = self._strict_macro_mask(states_active)
masked_logits = torch.where(
macro_mask, logits, torch.full_like(logits, -float("inf"))
)
dist = Categorical(logits=masked_logits)
ctx.current_estimator_output = None
return dist, ctx

def record_step(
self,
ctx: Any,
step_mask: torch.Tensor,
sampled_actions: torch.Tensor,
dist: Distribution,
save_logprobs: bool,
save_estimator_outputs: bool,
) -> None:
if save_logprobs:
lp_masked = dist.log_prob(sampled_actions)
step_lp = torch.full((ctx.batch_size,), 0.0, device=ctx.device)
step_lp[step_mask] = lp_masked
ctx.extras["macro_log_probs"].append(step_lp)
# No estimator outputs for macros by default
return

def finalize(self, ctx: Any) -> dict[str, Optional[torch.Tensor]]:
out: dict[str, Optional[torch.Tensor]] = {
"log_probs": None,
"estimator_outputs": None,
}
macro_log_probs = ctx.extras.get("macro_log_probs", [])
if macro_log_probs:
out["macro_log_probs"] = torch.stack(macro_log_probs, dim=0)
else:
out["macro_log_probs"] = None
return out

def get_current_estimator_output(self, ctx: Any):
return None
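
As a usage sketch (not part of the diff), the adapter's per-step call cycle can be exercised directly. The env, policy, and library objects below are hypothetical placeholders; only the ChunkedAdapter methods defined in this file are assumed, plus the standard torchgfn env.reset() API.

import torch

from gfn.chunking.adapters import ChunkedAdapter

# Hypothetical placeholders: a concrete DiscreteEnv, a ChunkedPolicy, and a
# macro library exposing `n_actions` and `id_to_sequence`.
adapter = ChunkedAdapter(env=env, policy=policy, library=library)

states = env.reset(batch_shape=(16,))  # initial active states (assumed torchgfn API)
ctx = adapter.init_context(batch_size=16, device=states.device)
step_mask = torch.ones(16, dtype=torch.bool, device=states.device)

# One macro step: masked Categorical over macros, sample, record log-probs.
dist, ctx = adapter.compute(states, ctx, step_mask)
macro_ids = dist.sample()
adapter.record_step(
    ctx, step_mask, macro_ids, dist, save_logprobs=True, save_estimator_outputs=False
)

out = adapter.finalize(ctx)  # out["macro_log_probs"] stacks one (16,) tensor per macro step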
150 changes: 150 additions & 0 deletions src/gfn/chunking/chunkers.py
@@ -0,0 +1,150 @@
from __future__ import annotations

import random
from abc import ABC, abstractmethod
from collections import Counter
from typing import TYPE_CHECKING, Any, Hashable, Sequence

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer

if TYPE_CHECKING: # Avoid runtime import to break circular deps with env/containers
from gfn.containers.trajectories import Trajectories


class Chunker(ABC):
"""Abstract base class for chunkers that propose new vocab tokens.

Chunkers operate on trajectories and environment context and return
a sequence of token keys (any Hashable) to be added to the env vocab.
"""

@abstractmethod
def propose_tokens(
self,
env: "Any",
trajectories: Trajectories,
n_tokens_to_add: int,
remove_old: bool,
) -> Sequence[Hashable]:
raise NotImplementedError


class UniformChunker(Chunker):
"""Proposes random bigrams of current non-exit tokens as tuples of ints."""

def propose_tokens(
self,
env: "Any",
trajectories: Trajectories,
n_tokens_to_add: int,
remove_old: bool,
) -> Sequence[Hashable]:
# Build non-exit pool from current vocab ids.
non_exit_ids = [i for i in range(env.n_actions) if i != env.exit_token_id]
seen = set(env.vocab)
out: set[Hashable] = set()
while len(out) < n_tokens_to_add and len(out) < 10_000:
Review comment: any reason the 10000 is hard coded here?

a, b = random.choice(non_exit_ids), random.choice(non_exit_ids)
candidate = (a, b)
if candidate not in seen:
out.add(candidate)
return list(out)


class _StringMapping:
"""Utility to map env keys to strings suitable for tokenizers."""

def __init__(self, delimiter: str = "") -> None:
self.delimiter = delimiter

def key_to_str(self, key: Hashable) -> str:
if isinstance(key, tuple):
return self.delimiter.join(str(x) for x in key)
return str(key)


class BPEChunker(Chunker):
def __init__(self, unk_token: str = "[UNK]", delimiter: str = "") -> None:
self.unk_token = unk_token
self.mapper = _StringMapping(delimiter=delimiter)

def propose_tokens(
self,
env: "Any",
trajectories: Trajectories,
n_tokens_to_add: int,
remove_old: bool,
min_frequency: int = 5,
) -> Sequence[Hashable]:
# Build corpus strings from trajectories via env tokenizer
corpus = env.trajectories_to_token_strings(trajectories)

# Build initial vocab from current env keys mapped to strings
vocab_dict = {self.mapper.key_to_str(k): i for i, k in enumerate(env.vocab)}
tokenizer = Tokenizer(BPE(vocab_dict, [], unk_token=self.unk_token))

target_vocab_size = len(env.vocab) - 1 + n_tokens_to_add
trainer = BpeTrainer(
vocab_size=target_vocab_size, # type: ignore
special_tokens=[self.unk_token], # type: ignore
min_frequency=min_frequency, # type: ignore
)
tokenizer.train_from_iterator(corpus, trainer=trainer)

# Take the most common new tokens.
base_vocab = set(vocab_dict.keys())
encodings = tokenizer.encode_batch(corpus)
counts = Counter()
for enc in encodings:
for tok in enc.tokens:
if tok not in base_vocab and tok != self.unk_token and len(tok) > 0:
counts[tok] += 1

top_new = [tok for tok, _ in counts.most_common(n_tokens_to_add)]
return top_new


class WordPieceChunker(Chunker):
def __init__(self, unk_token: str = "[UNK]", delimiter: str = "") -> None:
self.unk_token = unk_token
self.mapper = _StringMapping(delimiter=delimiter)

def propose_tokens(
self,
env: "Any",
trajectories: Trajectories,
n_tokens_to_add: int,
remove_old: bool,
min_frequency: int = 5,
) -> Sequence[Hashable]:
corpus = env.trajectories_to_token_strings(trajectories)
vocab_dict = {self.mapper.key_to_str(k): i for i, k in enumerate(env.vocab)}
tokenizer = Tokenizer(
WordPiece(
vocab=vocab_dict,
unk_token=self.unk_token,
max_input_chars_per_word=100,
)
)
target_vocab_size = len(env.vocab) - 1 + n_tokens_to_add
trainer = WordPieceTrainer(
vocab_size=target_vocab_size,
continuing_subword_prefix="##", # Defined prefix (removed later).
special_tokens=[self.unk_token],
min_frequency=min_frequency,
)
tokenizer.train_from_iterator(corpus, trainer=trainer)

# Take the most common new tokens.
base_vocab = set(vocab_dict.keys())
encodings = tokenizer.encode_batch(corpus)
counts = Counter()
for enc in encodings:
for tok in enc.tokens:
if tok not in base_vocab and tok != self.unk_token and len(tok) > 0:
counts[tok.removeprefix("##")] += 1  # Strip the continuation prefix if present.

top_new = [tok for tok, _ in counts.most_common(n_tokens_to_add)]
return top_new
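
A minimal usage sketch, assuming an environment that exposes the interface these chunkers rely on (vocab, n_actions, exit_token_id, trajectories_to_token_strings) and a sampled Trajectories batch; env and trajectories below are placeholders, and only the chunker classes from this file are assumed.

from gfn.chunking.chunkers import BPEChunker, UniformChunker

# Placeholders: `env` must expose vocab / n_actions / exit_token_id /
# trajectories_to_token_strings, and `trajectories` is a sampled batch.
uniform = UniformChunker()
bigrams = uniform.propose_tokens(env, trajectories, n_tokens_to_add=4, remove_old=False)

bpe = BPEChunker(unk_token="[UNK]", delimiter="")
merges = bpe.propose_tokens(
    env, trajectories, n_tokens_to_add=8, remove_old=False, min_frequency=5
)

# The returned token keys would then be added to the environment's vocabulary
# by whatever vocab-update hook the env provides (not shown in this diff).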