Relative trajectory balance #457
The first changed file extends the pinned Brownian motion forward estimator with optional learned variance outputs and exploration noise:

```diff
@@ -1,3 +1,4 @@
+import math
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, Callable, Dict, List, Optional, Protocol, cast, runtime_checkable
@@ -1290,6 +1291,7 @@ def __init__(
         pf_module: nn.Module,
         sigma: float,
         num_discretization_steps: int,
+        n_variance_outputs: int = 0,
     ):
         """Initialize the PinnedBrownianMotionForward.
@@ -1305,6 +1307,12 @@ def __init__(
         self.sigma = sigma
         self.num_discretization_steps = num_discretization_steps
         self.dt = 1.0 / self.num_discretization_steps
+        self.n_variance_outputs = n_variance_outputs
+
+    @property
+    def expected_output_dim(self) -> int:
+        # Drift (s_dim) plus optional variance outputs.
+        return self.s_dim + self.n_variance_outputs

     def forward(self, input: States) -> torch.Tensor:
         """Forward pass of the module.
@@ -1329,7 +1337,6 @@ def to_probability_distribution(
         states: States,
         module_output: torch.Tensor,
         **policy_kwargs: Any,
-        # TODO: add epsilon-noisy exploration
     ) -> IsotropicGaussian:
         """Transform the output of the module into an IsotropicGaussian distribution,
         which is the distribution of the next states under the pinned Brownian motion
@@ -1339,24 +1346,66 @@ def to_probability_distribution(
             states: The states to use, states.tensor.shape = (*batch_shape, s_dim + 1).
             module_output: The output of the module (actions), as a tensor of shape
                 (*batch_shape, s_dim).
-            **policy_kwargs: Keyword arguments to modify the distribution.
+            **policy_kwargs: Keyword arguments to modify the distribution. Supported
+                keys:
+                - exploration_std: Optional callable or float controlling extra
+                  exploration noise on top of the base diffusion std. The callable
+                  should accept an integer step index and return a non-negative
+                  standard deviation in state space. When provided, the extra noise
+                  is combined in variance-space (logaddexp) with the base diffusion
+                  variance; non-positive exploration is ignored.

         Returns:
             An IsotropicGaussian distribution (distribution of the next states)
         """
         assert len(states.batch_shape) == 1, "States must have a batch_shape of length 1"
-        s_curr = states.tensor[:, :-1]
+        # s_curr = states.tensor[:, :-1]
         t_curr = states.tensor[:, [-1]]

         module_output = torch.where(
             (1.0 - t_curr) < self.dt * 1e-2,  # sf case; when t_curr is 1.0
-            torch.full_like(s_curr, -float("inf")),  # This is the exit action
+            # torch.full_like(s_curr, -float("inf")),  # This is the exit action
+            torch.full_like(module_output, -float("inf")),  # This is the exit action
             module_output,
         )

-        fwd_mean = self.dt * module_output
-        fwd_std = torch.tensor(self.sigma * self.dt**0.5, device=fwd_mean.device)
-        fwd_std = fwd_std.repeat(fwd_mean.shape[0], 1)
+        drift = module_output[..., : self.s_dim]
+        if self.n_variance_outputs > 0:
+            var_part = module_output[..., self.s_dim :]
+            # Reduce extra variance dims to a single scalar (isotropic for now).
+            log_std = var_part.mean(dim=-1, keepdim=True)
+            fwd_std = torch.exp(log_std) * math.sqrt(self.dt)
+        else:
+            fwd_std = torch.tensor(self.sigma * self.dt**0.5, device=drift.device)
+            fwd_std = fwd_std.repeat(drift.shape[0], 1)
+
+        # Match reference behavior: scale diffusion noise (not drift) by t_scale if present.
+        t_scale_factor = getattr(self.module, "t_scale", 1.0)
+        if t_scale_factor != 1.0:
+            fwd_std = fwd_std * math.sqrt(t_scale_factor)
+
+        fwd_mean = self.dt * drift
+
+        # Optional exploration noise: combine variances (quadrature/logaddexp).
+        exploration_std = policy_kwargs.pop("exploration_std", None)
+        exploration_std_t = torch.as_tensor(
+            exploration_std if exploration_std is not None else 0.0,
+            device=fwd_std.device,
+            dtype=fwd_std.dtype,
+        ).clamp(min=0.0)
+
+        # Combine base diffusion variance σ_base^2 with exploration variance σ_expl^2:
+        # σ_combined = sqrt(σ_base^2 + σ_expl^2). torch.compile friendly.
+        base_log_var = 2 * fwd_std.log()  # log(σ_base^2)
```
A reviewer commented on lines +1412 to +1414 with the following suggested change:

```diff
-        # Combine base diffusion variance σ_base^2 with exploration variance σ_expl^2:
-        # σ_combined = sqrt(σ_base^2 + σ_expl^2). torch.compile friendly.
-        base_log_var = 2 * fwd_std.log()  # log(σ_base^2)
+        # If there is no positive exploration noise, keep the base diffusion std.
+        # This avoids unnecessary log operations and potential log(0) issues.
+        if exploration_std_t.eq(0).all():
+            return IsotropicGaussian(fwd_mean, fwd_std)
+        # Combine base diffusion variance σ_base^2 with exploration variance σ_expl^2:
+        # σ_combined = sqrt(σ_base^2 + σ_expl^2). torch.compile friendly.
+        # Clamp fwd_std to a small positive value before taking the log to avoid
+        # numerical issues when fwd_std is extremely small or zero.
+        safe_fwd_std = fwd_std.clamp_min(1e-12)
+        base_log_var = 2 * safe_fwd_std.log()  # log(σ_base^2)
```
> @hyeok9855 this might have been the cause of the problem you had before in your code (learning slower), worth checking.

> @hyeok9855 actually, this was my bug!!
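For readers following this thread, here is a minimal standalone sketch of the variance-space combination under discussion, with the zero-exploration guard and the clamp from the suggestion folded in; the function name `combine_stds` and the `1e-12` floor are illustrative choices, not part of the library API.

```python
import torch


def combine_stds(base_std: torch.Tensor, exploration_std: torch.Tensor) -> torch.Tensor:
    """Return sqrt(base_std**2 + exploration_std**2) via logaddexp.

    Combining independent Gaussian noise sources in variance space keeps the
    result a valid standard deviation; logaddexp avoids under/overflow in the
    intermediate log-variances.
    """
    # No positive exploration noise: keep the base diffusion std unchanged.
    if exploration_std.eq(0).all():
        return base_std

    # Clamp before log() so a zero std does not produce a -inf log-variance.
    safe_base = base_std.clamp_min(1e-12)
    safe_expl = exploration_std.clamp_min(1e-12)
    # logaddexp(log σ_base², log σ_expl²) = log(σ_base² + σ_expl²)
    log_var = torch.logaddexp(2 * safe_base.log(), 2 * safe_expl.log())
    return torch.exp(0.5 * log_var)
```

For example, `combine_stds(torch.full((4, 1), 0.1), torch.tensor(0.05))` gives roughly 0.1118 per entry, i.e. sqrt(0.01 + 0.0025). A step-indexed schedule like the callable described in the docstring could be, say, `lambda step: 0.5 * (1.0 - step / num_discretization_steps)`, decaying the extra noise to zero by the final step.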
The second changed file adds forward-only and backward-only trajectory log-probability helpers:
```diff
@@ -11,7 +11,11 @@
 from gfn.estimators import Estimator
 from gfn.samplers import Sampler
 from gfn.states import States
-from gfn.utils.prob_calculations import get_trajectory_pfs_and_pbs
+from gfn.utils.prob_calculations import (
+    get_trajectory_pbs,
+    get_trajectory_pfs,
+    get_trajectory_pfs_and_pbs,
+)

 TrainingSampleType = TypeVar("TrainingSampleType", bound=Container)
```
```diff
@@ -343,6 +347,32 @@ def get_pfs_and_pbs(
             recalculate_all_logprobs,
         )

+    def trajectory_log_probs_forward(
+        self,
+        trajectories: Trajectories,
+        fill_value: float = 0.0,
+        recalculate_all_logprobs: bool = True,
+    ) -> torch.Tensor:
+        """Evaluates forward logprobs only for each trajectory in the batch."""
+        return get_trajectory_pfs(
+            self.pf,
+            trajectories,
+            fill_value=fill_value,
+            recalculate_all_logprobs=recalculate_all_logprobs,
+        )
+
+    def trajectory_log_probs_backward(
+        self,
+        trajectories: Trajectories,
+        fill_value: float = 0.0,
+    ) -> torch.Tensor:
+        """Evaluates backward logprobs only for each trajectory in the batch."""
+        return get_trajectory_pbs(
+            self.pb,
+            trajectories,
+            fill_value=fill_value,
+        )
+
     def get_scores(
         self,
         trajectories: Trajectories,
```

> (Author) Leaving these here because they might come in handy, but I don't think they're actually needed right now in this implementation.
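A rough usage sketch of the two new helpers follows; the `gflownet` and `trajectories` objects are hypothetical, and the step-major output layout and padding semantics are assumptions drawn from the docstrings rather than verified library behavior.

```python
# Hypothetical objects: a trajectory-based gflownet and a batch of sampled
# trajectories (e.g. obtained from a Sampler).
log_pf = gflownet.trajectory_log_probs_forward(trajectories)
log_pb = gflownet.trajectory_log_probs_backward(trajectories)

# Padded steps carry fill_value (0.0 by default), so summing over the step
# dimension yields per-trajectory log-probability totals, analogous to what
# get_pfs_and_pbs returns in a single call.
total_log_pf = log_pf.sum(dim=0)
total_log_pb = log_pb.sum(dim=0)
```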
The third changed file adds the new RTB loss to the trajectory-balance module:
```diff
@@ -3,6 +3,7 @@
 and the [Log Partition Variance loss](https://arxiv.org/abs/2302.05446).
 """

+import math
 from typing import cast

 import torch
@@ -16,6 +17,7 @@
     is_callable_exception_handler,
     warn_about_recalculating_logprobs,
 )
+from gfn.utils.prob_calculations import get_trajectory_pfs


 class TBGFlowNet(TrajectoryBasedGFlowNet):
```
```diff
@@ -132,6 +134,120 @@ def loss(
         return loss


+class RelativeTrajectoryBalanceGFlowNet(TrajectoryBasedGFlowNet):
+    r"""GFlowNet for the Relative Trajectory Balance (RTB) loss.
+
+    This objective matches a posterior sampler to a prior diffusion (or other
+    sequential) model by minimizing
+
+    .. math::
+
+        \left(\log Z_\phi + \log p_\phi(\tau) - \log p_\theta(\tau)
+        - \beta \log r(x_T)\right)^2,
+
+    where :math:`p_\theta` is a fixed prior process, :math:`p_\phi` is the
+    learnable posterior, :math:`r` is a positive reward/constraint on the
+    terminal state :math:`x_T`, and :math:`\log Z_\phi` is a learned scalar
+    normalizer.
+    """
+
+    def __init__(
+        self,
+        pf: Estimator,
+        prior_pf: Estimator,
+        *,
+        logZ: nn.Parameter | ScalarEstimator | None = None,
+        init_logZ: float = 0.0,
+        beta: float = 1.0,
+        log_reward_clip_min: float = -float("inf"),
+        debug: bool = False,
+    ):
+        """Initializes an RTB GFlowNet.
+
+        Args:
+            pf: Posterior forward policy estimator :math:`p_\\phi`.
+            prior_pf: Fixed prior forward policy estimator :math:`p_\\theta`.
+            logZ: Learnable log-partition parameter or ScalarEstimator for
+                conditional settings. Defaults to a scalar parameter.
+            init_logZ: Initial value for logZ if ``logZ`` is None.
+            beta: Optional scaling applied to the terminal log-reward.
+            log_reward_clip_min: If finite, clips terminal log-rewards.
+            debug: If True, enables extra checks at the cost of execution speed.
+        """
+        super().__init__(
+            pf=pf,
+            pb=None,
+            constant_pb=True,
+            log_reward_clip_min=log_reward_clip_min,
+        )
+        self.prior_pf = prior_pf
+        self.beta = torch.tensor(beta)
+        self.logZ = logZ or nn.Parameter(torch.tensor(init_logZ))
+        self.debug = debug  # TODO: to be passed to base classes.
```
A review comment proposed dropping the TODO:

```diff
-        self.debug = debug  # TODO: to be passed to base classes.
+        self.debug = debug
```

> The commented-out line appears to be dead code that should be removed. If it's intended for reference, consider moving it to a comment explaining why the change was made rather than leaving commented code.
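To make the class docstring's objective concrete, here is a minimal sketch of the RTB residual, assuming per-trajectory log-probabilities for the posterior and prior have already been computed (for instance via helpers like `trajectory_log_probs_forward` above); this is illustrative, not the PR's actual `loss` implementation, and all names below are placeholders.

```python
import torch


def rtb_loss(
    log_pf_posterior: torch.Tensor,  # log p_phi(tau), shape (batch,)
    log_pf_prior: torch.Tensor,  # log p_theta(tau), shape (batch,)
    log_reward: torch.Tensor,  # log r(x_T), shape (batch,)
    log_Z: torch.Tensor,  # learned scalar normalizer, log Z_phi
    beta: float = 1.0,
) -> torch.Tensor:
    """Mean squared RTB residual:

    (log Z_phi + log p_phi(tau) - log p_theta(tau) - beta * log r(x_T))^2
    """
    residual = log_Z + log_pf_posterior - log_pf_prior - beta * log_reward
    return residual.pow(2).mean()
```

Because the prior is fixed, gradients flow only through the posterior's log-probabilities and the learned `log_Z`, which is what lets the objective match the posterior sampler to the reward-tilted prior.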